From 2062687c4e37b4f83cb50b53a83ebf0bcce9dbc6 Mon Sep 17 00:00:00 2001
From: Jian Sun
Date: Wed, 21 Aug 2024 14:32:01 -0600
Subject: [PATCH] Update fill function for CUDA matrix (#626)

* update the fill function for cuda matrix to avoid data transfer

* fix compilation errors

* add a comment about template function

* update fill function for cuda sparse matrix

* remove gcc11 CI test and add gcc14 CI test
---
 .github/workflows/mac.yml                     |  2 +-
 include/micm/cuda/util/cuda_dense_matrix.hpp  | 11 +++++++--
 include/micm/cuda/util/cuda_matrix.cuh        |  7 ++++++
 include/micm/cuda/util/cuda_sparse_matrix.hpp | 11 +++++++--
 .../util/sparse_matrix_standard_ordering.hpp  |  6 ++---
 src/util/cuda_matrix.cu                       | 24 ++++++++++++++++++-
 6 files changed, 52 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index b32e49372..b047d90cb 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -67,7 +67,7 @@ jobs:
     strategy:
       matrix:
         compiler:
-          - { cpp: g++-11, c: gcc-11}
+          - { cpp: g++-14, c: gcc-14}
           - { cpp: g++-12, c: gcc-12}
           - { cpp: clang++, c: clang}
         build_type: [Release]
diff --git a/include/micm/cuda/util/cuda_dense_matrix.hpp b/include/micm/cuda/util/cuda_dense_matrix.hpp
index 2546cad77..37a7943cd 100644
--- a/include/micm/cuda/util/cuda_dense_matrix.hpp
+++ b/include/micm/cuda/util/cuda_dense_matrix.hpp
@@ -229,8 +229,15 @@ namespace micm
     /// @param val Value to set each element to
     void Fill(T val)
     {
-      std::fill(this->data_.begin(), this->data_.end(), val);
-      CHECK_CUDA_ERROR(micm::cuda::CopyToDevice(this->param_, this->data_), "cudaMemcpyHostToDevice");
+      if constexpr (std::is_same_v<T, int>)
+      {
+        // the cudaMemset function only works for integer types and is an asynchronous function: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1gf7338650f7683c51ee26aadc6973c63a
+        CHECK_CUDA_ERROR(cudaMemset(this->param_.d_data_, val, sizeof(double) * this->param_.number_of_elements_), "cudaMemset");
+      }
+      else
+      {
+        CHECK_CUDA_ERROR(micm::cuda::FillCudaMatrix<T>(this->param_, val), "FillCudaMatrix");
+      }
     }
   };  // class CudaDenseMatrix
 
diff --git a/include/micm/cuda/util/cuda_matrix.cuh b/include/micm/cuda/util/cuda_matrix.cuh
index e6d548a15..bd654dac0 100644
--- a/include/micm/cuda/util/cuda_matrix.cuh
+++ b/include/micm/cuda/util/cuda_matrix.cuh
@@ -40,5 +40,12 @@ namespace micm
     /// @param vectorMatrixSrc Struct containing allocated source device memory to copy from
     /// @returns Error code from copying to destination device memory from source device memory, if any
     cudaError_t CopyToDeviceFromDevice(CudaMatrixParam& vectorMatrixDest, const CudaMatrixParam& vectorMatrixSrc);
+
+
+    /// @brief Fills a CUDA matrix with a specified value
+    /// @param param Struct containing allocated device memory
+    /// @param val Value to fill the matrix with
+    template<typename T>
+    cudaError_t FillCudaMatrix(CudaMatrixParam& param, T val);
   }  // namespace cuda
 }  // namespace micm
diff --git a/include/micm/cuda/util/cuda_sparse_matrix.hpp b/include/micm/cuda/util/cuda_sparse_matrix.hpp
index 84d730e71..a40fc81a1 100644
--- a/include/micm/cuda/util/cuda_sparse_matrix.hpp
+++ b/include/micm/cuda/util/cuda_sparse_matrix.hpp
@@ -129,8 +129,15 @@ namespace micm
     /// @param val Value to set each element to
     void Fill(T val)
     {
-      std::fill(this->data_.begin(), this->data_.end(), val);
-      CHECK_CUDA_ERROR(micm::cuda::CopyToDevice(this->param_, this->data_), "cudaMemcpyHostToDevice");
+      if constexpr (std::is_same_v<T, int>)
+      {
+        // the cudaMemset function only works for integer types and is an asynchronous function: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1gf7338650f7683c51ee26aadc6973c63a
+        CHECK_CUDA_ERROR(cudaMemset(this->param_.d_data_, val, sizeof(double) * this->param_.number_of_elements_), "cudaMemset");
+      }
+      else
+      {
+        CHECK_CUDA_ERROR(micm::cuda::FillCudaMatrix<T>(this->param_, val), "FillCudaMatrix");
+      }
     }
 
     CudaMatrixParam AsDeviceParam() const
diff --git a/include/micm/util/sparse_matrix_standard_ordering.hpp b/include/micm/util/sparse_matrix_standard_ordering.hpp
index 604cbc644..b7c65dc63 100644
--- a/include/micm/util/sparse_matrix_standard_ordering.hpp
+++ b/include/micm/util/sparse_matrix_standard_ordering.hpp
@@ -25,7 +25,7 @@ namespace micm
         const std::vector<std::size_t>& row_start)
     {
       return number_of_blocks * row_ids.size();
-    };
+    }
 
     static std::size_t VectorIndex(
         std::size_t number_of_blocks,
@@ -43,7 +43,7 @@ namespace micm
       if (elem == end)
         throw std::system_error(make_error_code(MicmMatrixErrc::ZeroElementAccess));
       return std::size_t{ (elem - row_ids.begin()) + block * row_ids.size() };
-    };
+    }
 
     static void AddToDiagonal(
         const std::vector<std::size_t>& diagonal_ids,
@@ -55,6 +55,6 @@ namespace micm
       for (std::size_t block_start = 0; block_start < number_of_blocks * block_size; block_start += block_size)
         for (const auto& i : diagonal_ids)
           data[block_start + i] += value;
-    };
+    }
   };
 }  // namespace micm
diff --git a/src/util/cuda_matrix.cu b/src/util/cuda_matrix.cu
index d92a1c7a4..1f4b97361 100644
--- a/src/util/cuda_matrix.cu
+++ b/src/util/cuda_matrix.cu
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 #include <micm/cuda/util/cuda_matrix.cuh>
 #include <micm/cuda/util/cuda_util.cuh>
-
+#include <micm/cuda/util/cuda_param.hpp>
 #include <cuda_runtime.h>
 
 #include <vector>
@@ -55,5 +55,27 @@ namespace micm
           cudaMemcpyDeviceToDevice);
       return err;
     }
+
+    template<typename T>
+    __global__ void FillCudaMatrixKernel(T* d_data, std::size_t number_of_elements, T val)
+    {
+      std::size_t tid = blockIdx.x * BLOCK_SIZE + threadIdx.x;
+      if (tid < number_of_elements)
+      {
+        d_data[tid] = val;
+      }
+    }
+
+    template<typename T>
+    cudaError_t FillCudaMatrix(CudaMatrixParam& param, T val)
+    {
+      std::size_t number_of_blocks = (param.number_of_elements_ + BLOCK_SIZE - 1) / BLOCK_SIZE;
+      FillCudaMatrixKernel<<<number_of_blocks, BLOCK_SIZE>>>(param.d_data_, param.number_of_elements_, val);
+      cudaError_t err = cudaGetLastError();
+      return err;
+    }
+
+    // source code needs the instantiation of the template
+    template cudaError_t FillCudaMatrix(CudaMatrixParam& param, double val);
   }  // namespace cuda
 }  // namespace micm