Skip to content

Commit

Permalink
Update fill function for CUDA matrix (#626)
Browse files Browse the repository at this point in the history
* update the fill function for cuda matrix to avoid data transfer

* fix compilation errors

* add a comment about template function

* update fill function for cuda sparse matrix

* remove gcc11 CI test and add gcc14 CI test
  • Loading branch information
sjsprecious authored Aug 21, 2024
1 parent 649cb0e commit 2062687
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/mac.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ jobs:
strategy:
matrix:
compiler:
- { cpp: g++-11, c: gcc-11}
- { cpp: g++-14, c: gcc-14}
- { cpp: g++-12, c: gcc-12}
- { cpp: clang++, c: clang}
build_type: [Release]
Expand Down
11 changes: 9 additions & 2 deletions include/micm/cuda/util/cuda_dense_matrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -229,8 +229,15 @@ namespace micm
/// @param val Value to set each element to
void Fill(T val)
{
  if constexpr (std::is_same_v<T, int>)
  {
    // cudaMemset writes the single byte (unsigned char)val into every byte of the
    // range, so the resulting *elements* only equal `val` when every byte of the
    // element's representation is that byte — i.e. when val == 0. For any other
    // value (e.g. Fill(5)) the previous code silently produced garbage elements,
    // so reject such requests loudly instead.
    // cudaMemset is asynchronous with respect to the host:
    // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1gf7338650f7683c51ee26aadc6973c63a
    if (val != 0)
    {
      CHECK_CUDA_ERROR(cudaErrorInvalidValue, "cudaMemset can only zero-fill; non-zero integer fills are not supported on the device");
    }
    else
    {
      // NOTE(review): sizeof(double) is kept from the original code — it assumes the
      // device buffer holds double-sized elements even when T == int; confirm against
      // the declared type of CudaMatrixParam::d_data_.
      CHECK_CUDA_ERROR(cudaMemset(this->param_.d_data_, 0, sizeof(double) * this->param_.number_of_elements_), "cudaMemset");
    }
  }
  else
  {
    // Fill in place on the device; avoids the host-side std::fill plus
    // host-to-device copy the previous implementation performed.
    CHECK_CUDA_ERROR(micm::cuda::FillCudaMatrix<T>(this->param_, val), "FillCudaMatrix");
  }
}

}; // class CudaDenseMatrix
Expand Down
7 changes: 7 additions & 0 deletions include/micm/cuda/util/cuda_matrix.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,12 @@ namespace micm
/// @param vectorMatrixSrc Struct containing allocated source device memory to copy from
/// @returns Error code from copying to destination device memory from source device memory, if any
cudaError_t CopyToDeviceFromDevice(CudaMatrixParam& vectorMatrixDest, const CudaMatrixParam& vectorMatrixSrc);


/// @brief Fills a CUDA matrix with a specified value
/// @param param Struct containing allocated device memory
/// @param val Value to fill the matrix with
/// @returns Error code from launching the fill kernel, if any
template<typename T>
cudaError_t FillCudaMatrix(CudaMatrixParam& param, T val);
} // namespace cuda
} // namespace micm
11 changes: 9 additions & 2 deletions include/micm/cuda/util/cuda_sparse_matrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,15 @@ namespace micm
/// @param val Value to set each element to
void Fill(T val)
{
  if constexpr (std::is_same_v<T, int>)
  {
    // cudaMemset writes the single byte (unsigned char)val into every byte of the
    // range, so the resulting *elements* only equal `val` when every byte of the
    // element's representation is that byte — i.e. when val == 0. For any other
    // value the previous code silently produced garbage elements, so reject such
    // requests loudly instead.
    // cudaMemset is asynchronous with respect to the host:
    // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1gf7338650f7683c51ee26aadc6973c63a
    if (val != 0)
    {
      CHECK_CUDA_ERROR(cudaErrorInvalidValue, "cudaMemset can only zero-fill; non-zero integer fills are not supported on the device");
    }
    else
    {
      // NOTE(review): sizeof(double) is kept from the original code — it assumes the
      // device buffer holds double-sized elements even when T == int; confirm against
      // the declared type of CudaMatrixParam::d_data_.
      CHECK_CUDA_ERROR(cudaMemset(this->param_.d_data_, 0, sizeof(double) * this->param_.number_of_elements_), "cudaMemset");
    }
  }
  else
  {
    // Fill in place on the device; avoids the host-side std::fill plus
    // host-to-device copy the previous implementation performed.
    CHECK_CUDA_ERROR(micm::cuda::FillCudaMatrix<T>(this->param_, val), "FillCudaMatrix");
  }
}

CudaMatrixParam AsDeviceParam() const
Expand Down
6 changes: 3 additions & 3 deletions include/micm/util/sparse_matrix_standard_ordering.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ namespace micm
const std::vector<std::size_t>& row_start)
{
return number_of_blocks * row_ids.size();
};
}

static std::size_t VectorIndex(
std::size_t number_of_blocks,
Expand All @@ -43,7 +43,7 @@ namespace micm
if (elem == end)
throw std::system_error(make_error_code(MicmMatrixErrc::ZeroElementAccess));
return std::size_t{ (elem - row_ids.begin()) + block * row_ids.size() };
};
}

static void AddToDiagonal(
const std::vector<std::size_t>& diagonal_ids,
Expand All @@ -55,6 +55,6 @@ namespace micm
for (std::size_t block_start = 0; block_start < number_of_blocks * block_size; block_start += block_size)
for (const auto& i : diagonal_ids)
data[block_start + i] += value;
};
}
};
} // namespace micm
24 changes: 23 additions & 1 deletion src/util/cuda_matrix.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
#include <micm/cuda/util/cuda_matrix.cuh>
#include <micm/util/internal_error.hpp>

#include <micm/cuda/util/cuda_param.hpp>
#include <cuda_runtime.h>

#include <vector>
Expand Down Expand Up @@ -55,5 +55,27 @@ namespace micm
cudaMemcpyDeviceToDevice);
return err;
}

/// @brief Kernel that assigns `val` to every element of a device array.
///        Uses a grid-stride loop so it is correct for any grid size and
///        avoids unsigned-int overflow in the flat-index computation that
///        `blockIdx.x * BLOCK_SIZE` suffered for very large matrices.
///        Expects a 1-D launch; no shared memory required.
template<typename T>
__global__ void FillCudaMatrixKernel(T* d_data, std::size_t number_of_elements, T val)
{
  // Widen to size_t before multiplying; blockIdx.x and blockDim.x are 32-bit.
  std::size_t stride = static_cast<std::size_t>(gridDim.x) * blockDim.x;
  for (std::size_t tid = static_cast<std::size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       tid < number_of_elements;
       tid += stride)
  {
    d_data[tid] = val;
  }
}

/// @brief Launches FillCudaMatrixKernel over the matrix described by `param`.
/// @param param Struct containing allocated device memory
/// @param val Value to fill the matrix with
/// @returns Error code from the kernel launch, if any
template<typename T>
cudaError_t FillCudaMatrix(CudaMatrixParam& param, T val)
{
  // A 0-block launch is an invalid configuration; an empty matrix is trivially filled.
  if (param.number_of_elements_ == 0)
    return cudaSuccess;
  std::size_t number_of_blocks = (param.number_of_elements_ + BLOCK_SIZE - 1) / BLOCK_SIZE;
  FillCudaMatrixKernel<<<number_of_blocks, BLOCK_SIZE>>>(param.d_data_, param.number_of_elements_, val);
  // Kernel launches return no status directly; query the launch error explicitly.
  cudaError_t err = cudaGetLastError();
  return err;
}

// source code needs the instantiation of the template
template cudaError_t FillCudaMatrix<double>(CudaMatrixParam& param, double val);
} // namespace cuda
} // namespace micm

0 comments on commit 2062687

Please sign in to comment.