Skip to content

Commit

Permalink
Update fill function for CUDA matrix (#626)
Browse files Browse the repository at this point in the history
* update the fill function for cuda matrix to avoid data transfer

* fix compilation errors

* add a comment about template function

* update fill function for cuda sparse matrix

* remove gcc11 CI test and add gcc14 CI test
  • Loading branch information
sjsprecious authored Aug 21, 2024
1 parent 649cb0e commit 2062687
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/mac.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ jobs:
strategy:
matrix:
compiler:
- { cpp: g++-11, c: gcc-11}
- { cpp: g++-14, c: gcc-14}
- { cpp: g++-12, c: gcc-12}
- { cpp: clang++, c: clang}
build_type: [Release]
Expand Down
11 changes: 9 additions & 2 deletions include/micm/cuda/util/cuda_dense_matrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -229,8 +229,15 @@ namespace micm
/// @param val Value to set each element to
void Fill(T val)
{
  if constexpr (std::is_same_v<T, int>)
  {
    // cudaMemset writes the single byte (unsigned char)val into every byte of the
    // range, so the resulting *elements* only equal `val` when every byte of the
    // element's representation is that byte — i.e. when val == 0. For any other
    // value (e.g. Fill(5)) the previous code silently produced garbage elements,
    // so reject such requests loudly instead.
    // cudaMemset is asynchronous with respect to the host:
    // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1gf7338650f7683c51ee26aadc6973c63a
    if (val != 0)
    {
      CHECK_CUDA_ERROR(cudaErrorInvalidValue, "cudaMemset can only zero-fill; non-zero integer fills are not supported on the device");
    }
    else
    {
      // NOTE(review): sizeof(double) is kept from the original code — it assumes the
      // device buffer holds double-sized elements even when T == int; confirm against
      // the declared type of CudaMatrixParam::d_data_.
      CHECK_CUDA_ERROR(cudaMemset(this->param_.d_data_, 0, sizeof(double) * this->param_.number_of_elements_), "cudaMemset");
    }
  }
  else
  {
    // Fill in place on the device; avoids the host-side std::fill plus
    // host-to-device copy the previous implementation performed.
    CHECK_CUDA_ERROR(micm::cuda::FillCudaMatrix<T>(this->param_, val), "FillCudaMatrix");
  }
}

}; // class CudaDenseMatrix
Expand Down
7 changes: 7 additions & 0 deletions include/micm/cuda/util/cuda_matrix.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,12 @@ namespace micm
/// @param vectorMatrixSrc Struct containing allocated source device memory to copy from
/// @returns Error code from copying to destination device memory from source device memory, if any
cudaError_t CopyToDeviceFromDevice(CudaMatrixParam& vectorMatrixDest, const CudaMatrixParam& vectorMatrixSrc);


/// @brief Fills a CUDA matrix with a specified value
/// @param param Struct containing allocated device memory
/// @param val Value to fill the matrix with
/// @returns Error code from launching the fill kernel, if any
template<typename T>
cudaError_t FillCudaMatrix(CudaMatrixParam& param, T val);
} // namespace cuda
} // namespace micm
11 changes: 9 additions & 2 deletions include/micm/cuda/util/cuda_sparse_matrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,15 @@ namespace micm
/// @param val Value to set each element to
void Fill(T val)
{
  if constexpr (std::is_same_v<T, int>)
  {
    // cudaMemset writes the single byte (unsigned char)val into every byte of the
    // range, so the resulting *elements* only equal `val` when every byte of the
    // element's representation is that byte — i.e. when val == 0. For any other
    // value the previous code silently produced garbage elements, so reject such
    // requests loudly instead.
    // cudaMemset is asynchronous with respect to the host:
    // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1gf7338650f7683c51ee26aadc6973c63a
    if (val != 0)
    {
      CHECK_CUDA_ERROR(cudaErrorInvalidValue, "cudaMemset can only zero-fill; non-zero integer fills are not supported on the device");
    }
    else
    {
      // NOTE(review): sizeof(double) is kept from the original code — it assumes the
      // device buffer holds double-sized elements even when T == int; confirm against
      // the declared type of CudaMatrixParam::d_data_.
      CHECK_CUDA_ERROR(cudaMemset(this->param_.d_data_, 0, sizeof(double) * this->param_.number_of_elements_), "cudaMemset");
    }
  }
  else
  {
    // Fill in place on the device; avoids the host-side std::fill plus
    // host-to-device copy the previous implementation performed.
    CHECK_CUDA_ERROR(micm::cuda::FillCudaMatrix<T>(this->param_, val), "FillCudaMatrix");
  }
}

CudaMatrixParam AsDeviceParam() const
Expand Down
6 changes: 3 additions & 3 deletions include/micm/util/sparse_matrix_standard_ordering.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ namespace micm
const std::vector<std::size_t>& row_start)
{
return number_of_blocks * row_ids.size();
};
}

static std::size_t VectorIndex(
std::size_t number_of_blocks,
Expand All @@ -43,7 +43,7 @@ namespace micm
if (elem == end)
throw std::system_error(make_error_code(MicmMatrixErrc::ZeroElementAccess));
return std::size_t{ (elem - row_ids.begin()) + block * row_ids.size() };
};
}

static void AddToDiagonal(
const std::vector<std::size_t>& diagonal_ids,
Expand All @@ -55,6 +55,6 @@ namespace micm
for (std::size_t block_start = 0; block_start < number_of_blocks * block_size; block_start += block_size)
for (const auto& i : diagonal_ids)
data[block_start + i] += value;
};
}
};
} // namespace micm
24 changes: 23 additions & 1 deletion src/util/cuda_matrix.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
#include <micm/cuda/util/cuda_matrix.cuh>
#include <micm/util/internal_error.hpp>

#include <micm/cuda/util/cuda_param.hpp>
#include <cuda_runtime.h>

#include <vector>
Expand Down Expand Up @@ -55,5 +55,27 @@ namespace micm
cudaMemcpyDeviceToDevice);
return err;
}

/// @brief Kernel that assigns `val` to every element of a device array.
///        Uses a grid-stride loop so it is correct for any grid size and
///        avoids unsigned-int overflow in the flat-index computation that
///        `blockIdx.x * BLOCK_SIZE` suffered for very large matrices.
///        Expects a 1-D launch; no shared memory required.
template<typename T>
__global__ void FillCudaMatrixKernel(T* d_data, std::size_t number_of_elements, T val)
{
  // Widen to size_t before multiplying; blockIdx.x and blockDim.x are 32-bit.
  std::size_t stride = static_cast<std::size_t>(gridDim.x) * blockDim.x;
  for (std::size_t tid = static_cast<std::size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       tid < number_of_elements;
       tid += stride)
  {
    d_data[tid] = val;
  }
}

/// @brief Launches FillCudaMatrixKernel over the matrix described by `param`.
/// @param param Struct containing allocated device memory
/// @param val Value to fill the matrix with
/// @returns Error code from the kernel launch, if any
template<typename T>
cudaError_t FillCudaMatrix(CudaMatrixParam& param, T val)
{
  // A 0-block launch is an invalid configuration; an empty matrix is trivially filled.
  if (param.number_of_elements_ == 0)
    return cudaSuccess;
  std::size_t number_of_blocks = (param.number_of_elements_ + BLOCK_SIZE - 1) / BLOCK_SIZE;
  FillCudaMatrixKernel<<<number_of_blocks, BLOCK_SIZE>>>(param.d_data_, param.number_of_elements_, val);
  // Kernel launches return no status directly; query the launch error explicitly.
  cudaError_t err = cudaGetLastError();
  return err;
}

// source code needs the instantiation of the template
template cudaError_t FillCudaMatrix<double>(CudaMatrixParam& param, double val);
} // namespace cuda
} // namespace micm

0 comments on commit 2062687

Please sign in to comment.