Skip to content

Commit

Permalink
Remove data transfer in cuda matrix constructor and template some CUD…
Browse files Browse the repository at this point in the history
…A functions (#630)

* remove data transfer in the cuda dense matrix constructor

* template many cuda functions for cuda dense and sparse matrix
  • Loading branch information
sjsprecious authored Aug 23, 2024
1 parent 18c855f commit be4b2d8
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 91 deletions.
64 changes: 22 additions & 42 deletions include/micm/cuda/util/cuda_dense_matrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,44 +68,36 @@ namespace micm
CudaMatrixParam param_;

public:
CudaDenseMatrix() requires(std::is_same_v<T, double>)

CudaDenseMatrix()
: VectorMatrix<T, L>()
{
this->param_.number_of_grid_cells_ = 0;
this->param_.number_of_elements_ = this->data_.size();
CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->param_.number_of_elements_), "cudaMalloc");
}
CudaDenseMatrix()
: VectorMatrix<T, L>()
{
CHECK_CUDA_ERROR(micm::cuda::MallocVector<T>(this->param_, this->param_.number_of_elements_), "cudaMalloc");
}

CudaDenseMatrix(std::size_t x_dim, std::size_t y_dim) requires(std::is_same_v<T, double>)
CudaDenseMatrix(std::size_t x_dim, std::size_t y_dim)
: VectorMatrix<T, L>(x_dim, y_dim)
{
this->param_.number_of_elements_ = this->data_.size();
this->param_.number_of_grid_cells_ = x_dim;
CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->param_.number_of_elements_), "cudaMalloc");
}
CudaDenseMatrix(std::size_t x_dim, std::size_t y_dim)
: VectorMatrix<T, L>(x_dim, y_dim)
{
CHECK_CUDA_ERROR(micm::cuda::MallocVector<T>(this->param_, this->param_.number_of_elements_), "cudaMalloc");
}

CudaDenseMatrix(std::size_t x_dim, std::size_t y_dim, T initial_value) requires(std::is_same_v<T, double>)
CudaDenseMatrix(std::size_t x_dim, std::size_t y_dim, T initial_value)
: VectorMatrix<T, L>(x_dim, y_dim, initial_value)
{
this->param_.number_of_elements_ = this->data_.size();
this->param_.number_of_grid_cells_ = x_dim;
CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->param_.number_of_elements_), "cudaMalloc");
CHECK_CUDA_ERROR(micm::cuda::CopyToDevice(this->param_, this->data_), "cudaMemcpyHostToDevice");
}
CudaDenseMatrix(std::size_t x_dim, std::size_t y_dim, T initial_value)
: VectorMatrix<T, L>(x_dim, y_dim, initial_value)
{
if (this->param_.number_of_elements_ != 0)
{
CHECK_CUDA_ERROR(micm::cuda::MallocVector<T>(this->param_, this->param_.number_of_elements_), "cudaMalloc");
Fill(initial_value);
}
}

CudaDenseMatrix(const std::vector<std::vector<T>> other) requires(std::is_same_v<T, double>)
CudaDenseMatrix(const std::vector<std::vector<T>> other)
: VectorMatrix<T, L>(other)
{
this->param_.number_of_grid_cells_ = 0;
Expand All @@ -114,28 +106,18 @@ namespace micm
{
this->param_.number_of_elements_ += inner_vector.size();
}
CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->param_.number_of_elements_), "cudaMalloc");
}

CudaDenseMatrix(const std::vector<std::vector<T>> other)
: VectorMatrix<T, L>(other)
{
CHECK_CUDA_ERROR(micm::cuda::MallocVector<T>(this->param_, this->param_.number_of_elements_), "cudaMalloc");
}

CudaDenseMatrix(const CudaDenseMatrix& other) requires(std::is_same_v<T, double>)
CudaDenseMatrix(const CudaDenseMatrix& other)
: VectorMatrix<T, L>(other)
{
this->param_ = other.param_;
this->param_.d_data_ = nullptr;
this->param_.number_of_elements_ = other.param_.number_of_elements_;
this->param_.number_of_grid_cells_ = other.param_.number_of_grid_cells_;
CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->param_.number_of_elements_), "cudaMalloc");
CHECK_CUDA_ERROR(micm::cuda::CopyToDeviceFromDevice(this->param_, other.param_), "cudaMemcpyDeviceToDevice");
}

CudaDenseMatrix(const CudaDenseMatrix& other)
: VectorMatrix<T, L>(other)
{
CHECK_CUDA_ERROR(micm::cuda::MallocVector<T>(this->param_, this->param_.number_of_elements_), "cudaMalloc");
CHECK_CUDA_ERROR(micm::cuda::CopyToDeviceFromDevice<T>(this->param_, other.param_), "cudaMemcpyDeviceToDevice");
}

CudaDenseMatrix(CudaDenseMatrix&& other) noexcept
Expand All @@ -151,8 +133,8 @@ namespace micm
if (this->param_.d_data_ != nullptr)
CHECK_CUDA_ERROR(micm::cuda::FreeVector(this->param_), "cudaFree");
this->param_ = other.param_;
CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->param_.number_of_elements_), "cudaMalloc");
CHECK_CUDA_ERROR(micm::cuda::CopyToDeviceFromDevice(this->param_, other.param_), "cudaMemcpyDeviceToDevice");
CHECK_CUDA_ERROR(micm::cuda::MallocVector<T>(this->param_, this->param_.number_of_elements_), "cudaMalloc");
CHECK_CUDA_ERROR(micm::cuda::CopyToDeviceFromDevice<T>(this->param_, other.param_), "cudaMemcpyDeviceToDevice");
return *this;
}

Expand All @@ -178,14 +160,12 @@ namespace micm

void CopyToDevice()
{
static_assert(std::is_same_v<T, double>);
CHECK_CUDA_ERROR(micm::cuda::CopyToDevice(this->param_, this->data_), "cudaMemcpyHostToDevice");
CHECK_CUDA_ERROR(micm::cuda::CopyToDevice<T>(this->param_, this->data_), "cudaMemcpyHostToDevice");
}

void CopyToHost()
{
static_assert(std::is_same_v<T, double>);
CHECK_CUDA_ERROR(micm::cuda::CopyToHost(this->param_, this->data_), "cudaMemcpyDeviceToHost");
CHECK_CUDA_ERROR(micm::cuda::CopyToHost<T>(this->param_, this->data_), "cudaMemcpyDeviceToHost");
}

CudaMatrixParam AsDeviceParam() const
Expand Down Expand Up @@ -222,7 +202,7 @@ namespace micm
{
throw std::runtime_error("Both CUDA dense matrices must have the same size.");
}
CHECK_CUDA_ERROR(micm::cuda::CopyToDeviceFromDevice(this->param_, other.param_), "cudaMemcpyDeviceToDevice");
CHECK_CUDA_ERROR(micm::cuda::CopyToDeviceFromDevice<T>(this->param_, other.param_), "cudaMemcpyDeviceToDevice");
}

/// @brief Set every matrix element to a given value on the GPU
Expand All @@ -234,7 +214,7 @@ namespace micm
// the cudaMemset function only works for integer types and is an asynchronous function:
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1gf7338650f7683c51ee26aadc6973c63a
CHECK_CUDA_ERROR(
cudaMemset(this->param_.d_data_, val, sizeof(double) * this->param_.number_of_elements_), "cudaMemset");
cudaMemset(this->param_.d_data_, val, sizeof(T) * this->param_.number_of_elements_), "cudaMemset");
}
else
{
Expand Down
8 changes: 6 additions & 2 deletions include/micm/cuda/util/cuda_matrix.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ namespace micm
/// @param vectorMatrix Reference to struct containing information about allocated memory
/// @param num_elements Requested number of elements to allocate
/// @returns Error code from allocating data on the device, if any
template<typename T>
cudaError_t MallocVector(CudaMatrixParam& vectorMatrix, std::size_t num_elements);

/// @brief Free memory allocated on device
Expand All @@ -27,18 +28,21 @@ namespace micm
/// @param vectorMatrix Struct containing allocated device memory
/// @param h_data Host data to copy from
/// @returns Error code from copying to device from the host, if any
cudaError_t CopyToDevice(CudaMatrixParam& vectorMatrix, std::vector<double>& h_data);
template<typename T>
cudaError_t CopyToDevice(CudaMatrixParam& vectorMatrix, std::vector<T>& h_data);

/// @brief Copies data from the device to the host
/// @param vectorMatrix Struct containing allocated device memory
/// @param h_data Host data to copy data to
/// @returns Error code from copying from the device to the host, if any
cudaError_t CopyToHost(CudaMatrixParam& vectorMatrix, std::vector<double>& h_data);
template<typename T>
cudaError_t CopyToHost(CudaMatrixParam& vectorMatrix, std::vector<T>& h_data);

/// @brief Copies data to the destination device memory block from the source device memory block
/// @param vectorMatrixDest Struct containing allocated destination device memory to copy to
/// @param vectorMatrixSrc Struct containing allocated source device memory to copy from
/// @returns Error code from copying to destination device memory from source device memory, if any
template<typename T>
cudaError_t CopyToDeviceFromDevice(CudaMatrixParam& vectorMatrixDest, const CudaMatrixParam& vectorMatrixSrc);

/// @brief Fills a CUDA matrix with a specified value
Expand Down
50 changes: 12 additions & 38 deletions include/micm/cuda/util/cuda_sparse_matrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,47 +32,28 @@ namespace micm
this->param_.d_data_ = nullptr;
}

CudaSparseMatrix(const SparseMatrixBuilder<T, OrderingPolicy>& builder) requires(std::is_same_v<T, double>)
: SparseMatrix<T, OrderingPolicy>(builder)
{
this->param_.number_of_grid_cells_ = this->number_of_blocks_;
CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->data_.size()), "cudaMalloc");
}
CudaSparseMatrix(const SparseMatrixBuilder<T, OrderingPolicy>& builder)
: SparseMatrix<T, OrderingPolicy>(builder)
{
this->param_.d_data_ = nullptr;
}

CudaSparseMatrix<T, OrderingPolicy>& operator=(const SparseMatrixBuilder<T, OrderingPolicy>& builder) requires(
std::is_same_v<T, double>)
{
SparseMatrix<T, OrderingPolicy>::operator=(builder);
this->param_.number_of_grid_cells_ = this->number_of_blocks_;
CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->data_.size()), "cudaMalloc");
return *this;
CHECK_CUDA_ERROR(micm::cuda::MallocVector<T>(this->param_, this->data_.size()), "cudaMalloc");
}

CudaSparseMatrix<T, OrderingPolicy>& operator=(const SparseMatrixBuilder<T, OrderingPolicy>& builder)
{
SparseMatrix<T, OrderingPolicy>::operator=(builder);
this->param_.d_data_ = nullptr;
this->param_.number_of_grid_cells_ = this->number_of_blocks_;
CHECK_CUDA_ERROR(micm::cuda::MallocVector<T>(this->param_, this->data_.size()), "cudaMalloc");
return *this;
}

CudaSparseMatrix(const CudaSparseMatrix& other) requires(std::is_same_v<T, double>)
: SparseMatrix<T, OrderingPolicy>(other)
{
this->param_ = other.param_;
this->param_.d_data_ = nullptr;
CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->data_.size()), "cudaMalloc");
CHECK_CUDA_ERROR(micm::cuda::CopyToDeviceFromDevice(this->param_, other.param_), "cudaMemcpyDeviceToDevice");
}

CudaSparseMatrix(const CudaSparseMatrix& other)
: SparseMatrix<T, OrderingPolicy>(other)
{
this->param_ = other.param_;
this->param_.d_data_ = nullptr;
CHECK_CUDA_ERROR(micm::cuda::MallocVector<T>(this->param_, this->data_.size()), "cudaMalloc");
CHECK_CUDA_ERROR(micm::cuda::CopyToDeviceFromDevice<T>(this->param_, other.param_), "cudaMemcpyDeviceToDevice");
}

CudaSparseMatrix(CudaSparseMatrix&& other) noexcept
Expand All @@ -87,8 +68,8 @@ namespace micm
SparseMatrix<T, OrderingPolicy>::operator=(other);
this->param_ = other.param_;
this->param_.d_data_ = nullptr;
CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->data_.size()), "cudaMalloc");
CHECK_CUDA_ERROR(micm::cuda::CopyToDeviceFromDevice(this->param_, other.param_), "cudaMemcpyDeviceToDevice");
CHECK_CUDA_ERROR(micm::cuda::MallocVector<T>(this->param_, this->data_.size()), "cudaMalloc");
CHECK_CUDA_ERROR(micm::cuda::CopyToDeviceFromDevice<T>(this->param_, other.param_), "cudaMemcpyDeviceToDevice");
return *this;
}

Expand All @@ -102,27 +83,20 @@ namespace micm
return *this;
}

~CudaSparseMatrix() requires(std::is_same_v<T, double>)
{
CHECK_CUDA_ERROR(micm::cuda::FreeVector(this->param_), "cudaFree");
this->param_.d_data_ = nullptr;
}

~CudaSparseMatrix()
{
CHECK_CUDA_ERROR(micm::cuda::FreeVector(this->param_), "cudaFree");
this->param_.d_data_ = nullptr;
}

void CopyToDevice()
{
static_assert(std::is_same_v<T, double>);
CHECK_CUDA_ERROR(micm::cuda::CopyToDevice(this->param_, this->data_), "cudaMemcpyHostToDevice");
CHECK_CUDA_ERROR(micm::cuda::CopyToDevice<T>(this->param_, this->data_), "cudaMemcpyHostToDevice");
}

void CopyToHost()
{
static_assert(std::is_same_v<T, double>);
CHECK_CUDA_ERROR(micm::cuda::CopyToHost(this->param_, this->data_), "cudaMemcpyDeviceToHost");
CHECK_CUDA_ERROR(micm::cuda::CopyToHost<T>(this->param_, this->data_), "cudaMemcpyDeviceToHost");
}

/// @brief Set every matrix element to a given value on the GPU
Expand All @@ -134,7 +108,7 @@ namespace micm
// the cudaMemset function only works for integer types and is an asynchronous function:
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1gf7338650f7683c51ee26aadc6973c63a
CHECK_CUDA_ERROR(
cudaMemset(this->param_.d_data_, val, sizeof(double) * this->param_.number_of_elements_), "cudaMemset");
cudaMemset(this->param_.d_data_, val, sizeof(T) * this->param_.number_of_elements_), "cudaMemset");
}
else
{
Expand Down
22 changes: 16 additions & 6 deletions src/util/cuda_matrix.cu
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,11 @@ namespace micm
{
namespace cuda
{
template<typename T>
cudaError_t MallocVector(CudaMatrixParam& param, std::size_t number_of_elements)
{
param.number_of_elements_ = number_of_elements;
cudaError_t err = cudaMalloc(&(param.d_data_), sizeof(double) * number_of_elements);
cudaError_t err = cudaMalloc(&(param.d_data_), sizeof(T) * number_of_elements);
return err;
}

Expand All @@ -32,27 +33,30 @@ namespace micm
return err;
}

cudaError_t CopyToDevice(CudaMatrixParam& param, std::vector<double>& h_data)
template<typename T>
cudaError_t CopyToDevice(CudaMatrixParam& param, std::vector<T>& h_data)
{
cudaError_t err =
cudaMemcpy(param.d_data_, h_data.data(), sizeof(double) * param.number_of_elements_, cudaMemcpyHostToDevice);
cudaMemcpy(param.d_data_, h_data.data(), sizeof(T) * param.number_of_elements_, cudaMemcpyHostToDevice);
return err;
}

cudaError_t CopyToHost(CudaMatrixParam& param, std::vector<double>& h_data)
template<typename T>
cudaError_t CopyToHost(CudaMatrixParam& param, std::vector<T>& h_data)
{
cudaDeviceSynchronize();
cudaError_t err =
cudaMemcpy(h_data.data(), param.d_data_, sizeof(double) * param.number_of_elements_, cudaMemcpyDeviceToHost);
cudaMemcpy(h_data.data(), param.d_data_, sizeof(T) * param.number_of_elements_, cudaMemcpyDeviceToHost);
return err;
}

template<typename T>
cudaError_t CopyToDeviceFromDevice(CudaMatrixParam& vectorMatrixDest, const CudaMatrixParam& vectorMatrixSrc)
{
cudaError_t err = cudaMemcpy(
vectorMatrixDest.d_data_,
vectorMatrixSrc.d_data_,
sizeof(double) * vectorMatrixSrc.number_of_elements_,
sizeof(T) * vectorMatrixSrc.number_of_elements_,
cudaMemcpyDeviceToDevice);
return err;
}
Expand All @@ -77,6 +81,12 @@ namespace micm
}

// source code needs the instantiation of the template
template cudaError_t MallocVector<double>(CudaMatrixParam& param, std::size_t number_of_elements);
template cudaError_t MallocVector<int>(CudaMatrixParam& param, std::size_t number_of_elements);
template cudaError_t CopyToDevice<double>(CudaMatrixParam& param, std::vector<double>& h_data);
template cudaError_t CopyToHost<double>(CudaMatrixParam& param, std::vector<double>& h_data);
template cudaError_t CopyToDeviceFromDevice<double>(CudaMatrixParam& vectorMatrixDest, const CudaMatrixParam& vectorMatrixSrc);
template cudaError_t CopyToDeviceFromDevice<int>(CudaMatrixParam& vectorMatrixDest, const CudaMatrixParam& vectorMatrixSrc);
template cudaError_t FillCudaMatrix<double>(CudaMatrixParam& param, double val);
} // namespace cuda
} // namespace micm
6 changes: 3 additions & 3 deletions test/unit/cuda/util/test_cuda_dense_matrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,10 @@ TEST(CudaDenseMatrix, DeviceMemCopy)
std::size_t num_elements = h_vector.size();
CudaMatrixParam param;

micm::cuda::MallocVector(param, num_elements);
micm::cuda::CopyToDevice(param, h_vector);
micm::cuda::MallocVector<double>(param, num_elements);
micm::cuda::CopyToDevice<double>(param, h_vector);
micm::cuda::SquareDriver(param);
micm::cuda::CopyToHost(param, h_vector);
micm::cuda::CopyToHost<double>(param, h_vector);

EXPECT_EQ(h_vector[0], 1 * 1);
EXPECT_EQ(h_vector[1], 2 * 2);
Expand Down

0 comments on commit be4b2d8

Please sign in to comment.