diff --git a/include/micm/cuda/util/cuda_dense_matrix.hpp b/include/micm/cuda/util/cuda_dense_matrix.hpp index 4fd412c83..13b3552da 100644 --- a/include/micm/cuda/util/cuda_dense_matrix.hpp +++ b/include/micm/cuda/util/cuda_dense_matrix.hpp @@ -210,7 +210,13 @@ namespace micm { // the cudaMemset function only works for integer types and is an asynchronous function: // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1gf7338650f7683c51ee26aadc6973c63a - CHECK_CUDA_ERROR(cudaMemsetAsync(this->param_.d_data_, val, sizeof(T) * this->param_.number_of_elements_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemset"); + CHECK_CUDA_ERROR( + cudaMemsetAsync( + this->param_.d_data_, + val, + sizeof(T) * this->param_.number_of_elements_, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMemset"); } else { diff --git a/include/micm/cuda/util/cuda_util.cuh b/include/micm/cuda/util/cuda_util.cuh index bf4b7bd9e..4fc82bdf4 100644 --- a/include/micm/cuda/util/cuda_util.cuh +++ b/include/micm/cuda/util/cuda_util.cuh @@ -6,9 +6,9 @@ #include #include -#include #include #include +#include #define CHECK_CUDA_ERROR(err, msg) micm::cuda::CheckCudaError(err, __FILE__, __LINE__, msg) #define CHECK_CUBLAS_ERROR(err, msg) micm::cuda::CheckCublasError(err, __FILE__, __LINE__, msg) @@ -54,24 +54,23 @@ namespace micm /// @brief Singleton class to manage CUDA streams class CudaStreamSingleton { - public: - + public: ~CudaStreamSingleton() = default; - + CudaStreamSingleton(const CudaStreamSingleton&) = delete; CudaStreamSingleton& operator=(const CudaStreamSingleton&) = delete; // Get the only one instance of the singleton class static CudaStreamSingleton& GetInstance(); - + // Get the CUDA stream given a stream ID cudaStream_t& GetCudaStream(std::size_t stream_id); // Empty the map variable to clean up all CUDA streams void CleanUp(); - - private: + + private: // Private constructor to prevent direct instantiation CudaStreamSingleton() = default; diff --git a/src/process/process_set.cu b/src/process/process_set.cu index 92ffb2d02..67c9e3431 100644 --- a/src/process/process_set.cu +++ b/src/process/process_set.cu @@ -136,11 +136,32 @@ namespace micm ProcessSetParam devstruct; /// Allocate memory space on the device - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.number_of_reactants_), number_of_reactants_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.reactant_ids_), reactant_ids_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.number_of_products_), number_of_products_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.product_ids_), product_ids_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.yields_), yields_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync( + &(devstruct.number_of_reactants_), + number_of_reactants_bytes, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync( + &(devstruct.reactant_ids_), + reactant_ids_bytes, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); + CHECK_CUDA_ERROR( + 
cudaMallocAsync( + &(devstruct.number_of_products_), + number_of_products_bytes, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync( + &(devstruct.product_ids_), product_ids_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync( + &(devstruct.yields_), yields_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); /// Copy the data from host to device CHECK_CUDA_ERROR( @@ -152,7 +173,12 @@ namespace micm micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemcpy"); CHECK_CUDA_ERROR( - cudaMemcpyAsync(devstruct.reactant_ids_, hoststruct.reactant_ids_, reactant_ids_bytes, cudaMemcpyHostToDevice, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + cudaMemcpyAsync( + devstruct.reactant_ids_, + hoststruct.reactant_ids_, + reactant_ids_bytes, + cudaMemcpyHostToDevice, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemcpy"); CHECK_CUDA_ERROR( cudaMemcpyAsync( @@ -163,10 +189,21 @@ namespace micm micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemcpy"); CHECK_CUDA_ERROR( - cudaMemcpyAsync(devstruct.product_ids_, hoststruct.product_ids_, product_ids_bytes, cudaMemcpyHostToDevice, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + cudaMemcpyAsync( + devstruct.product_ids_, + hoststruct.product_ids_, + product_ids_bytes, + cudaMemcpyHostToDevice, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemcpy"); CHECK_CUDA_ERROR( - cudaMemcpyAsync(devstruct.yields_, hoststruct.yields_, yields_bytes, cudaMemcpyHostToDevice, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemcpy"); + cudaMemcpyAsync( + devstruct.yields_, + hoststruct.yields_, + yields_bytes, + cudaMemcpyHostToDevice, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMemcpy"); devstruct.number_of_reactants_size_ = hoststruct.number_of_reactants_size_; devstruct.reactant_ids_size_ = hoststruct.reactant_ids_size_; @@ -186,12 +223,21 @@ namespace micm size_t jacobian_flat_ids_bytes = sizeof(size_t) * hoststruct.jacobian_flat_ids_size_; /// Allocate memory space on the device - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.jacobian_flat_ids_), jacobian_flat_ids_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync( + &(devstruct.jacobian_flat_ids_), + jacobian_flat_ids_bytes, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); /// Copy the data from host to device CHECK_CUDA_ERROR( cudaMemcpyAsync( - devstruct.jacobian_flat_ids_, hoststruct.jacobian_flat_ids_, jacobian_flat_ids_bytes, cudaMemcpyHostToDevice, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + devstruct.jacobian_flat_ids_, + hoststruct.jacobian_flat_ids_, + jacobian_flat_ids_bytes, + cudaMemcpyHostToDevice, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemcpy"); devstruct.jacobian_flat_ids_size_ = hoststruct.jacobian_flat_ids_size_; @@ -202,18 +248,29 @@ namespace micm void FreeConstData(ProcessSetParam& devstruct) { if (devstruct.number_of_reactants_ != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.number_of_reactants_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.number_of_reactants_, 
micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaFree"); if (devstruct.reactant_ids_ != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.reactant_ids_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.reactant_ids_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaFree"); if (devstruct.number_of_products_ != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.number_of_products_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.number_of_products_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaFree"); if (devstruct.product_ids_ != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.product_ids_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.product_ids_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaFree"); if (devstruct.yields_ != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.yields_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.yields_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); if (devstruct.jacobian_flat_ids_ != nullptr) { - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.jacobian_flat_ids_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.jacobian_flat_ids_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaFree"); } } @@ -224,7 +281,11 @@ namespace micm const ProcessSetParam& devstruct) { size_t number_of_blocks = (rate_constants_param.number_of_grid_cells_ + BLOCK_SIZE - 1) / BLOCK_SIZE; - SubtractJacobianTermsKernel<<>>( + SubtractJacobianTermsKernel<<< + number_of_blocks, + BLOCK_SIZE, + 0, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)>>>( rate_constants_param, state_variables_param, jacobian_param, devstruct); } // end of SubtractJacobianTermsKernelDriver @@ -235,7 +296,11 @@ namespace micm const ProcessSetParam& devstruct) { size_t number_of_blocks = (rate_constants_param.number_of_grid_cells_ + BLOCK_SIZE - 1) / BLOCK_SIZE; - AddForcingTermsKernel<<>>( + AddForcingTermsKernel<<< + number_of_blocks, + BLOCK_SIZE, + 0, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)>>>( rate_constants_param, state_variables_param, forcing_param, devstruct); } // end of AddForcingTermsKernelDriver } // namespace cuda diff --git a/src/solver/linear_solver.cu b/src/solver/linear_solver.cu index 09e5c791b..1ffa1ef1c 100644 --- a/src/solver/linear_solver.cu +++ b/src/solver/linear_solver.cu @@ -89,20 +89,56 @@ namespace micm /// Create a struct whose members contain the addresses in the device memory. 
LinearSolverParam devstruct; - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.nLij_Lii_), nLij_Lii_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.Lij_yj_), Lij_yj_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.nUij_Uii_), nUij_Uii_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.Uij_xj_), Uij_xj_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync( + &(devstruct.nLij_Lii_), nLij_Lii_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync( + &(devstruct.Lij_yj_), Lij_yj_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync( + &(devstruct.nUij_Uii_), nUij_Uii_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync( + &(devstruct.Uij_xj_), Uij_xj_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); /// Copy the data from host to device CHECK_CUDA_ERROR( - cudaMemcpyAsync(devstruct.nLij_Lii_, hoststruct.nLij_Lii_, nLij_Lii_bytes, cudaMemcpyHostToDevice, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemcpy"); + cudaMemcpyAsync( + devstruct.nLij_Lii_, + hoststruct.nLij_Lii_, + nLij_Lii_bytes, + cudaMemcpyHostToDevice, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMemcpy"); CHECK_CUDA_ERROR( - cudaMemcpyAsync(devstruct.Lij_yj_, hoststruct.Lij_yj_, Lij_yj_bytes, cudaMemcpyHostToDevice, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemcpy"); + cudaMemcpyAsync( + devstruct.Lij_yj_, + hoststruct.Lij_yj_, + Lij_yj_bytes, + cudaMemcpyHostToDevice, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMemcpy"); CHECK_CUDA_ERROR( - cudaMemcpyAsync(devstruct.nUij_Uii_, hoststruct.nUij_Uii_, nUij_Uii_bytes, cudaMemcpyHostToDevice, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemcpy"); + cudaMemcpyAsync( + devstruct.nUij_Uii_, + hoststruct.nUij_Uii_, + nUij_Uii_bytes, + cudaMemcpyHostToDevice, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMemcpy"); CHECK_CUDA_ERROR( - cudaMemcpyAsync(devstruct.Uij_xj_, hoststruct.Uij_xj_, Uij_xj_bytes, cudaMemcpyHostToDevice, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemcpy"); + cudaMemcpyAsync( + devstruct.Uij_xj_, + hoststruct.Uij_xj_, + Uij_xj_bytes, + cudaMemcpyHostToDevice, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMemcpy"); devstruct.nLij_Lii_size_ = hoststruct.nLij_Lii_size_; devstruct.Lij_yj_size_ = hoststruct.Lij_yj_size_; @@ -117,13 +153,17 @@ namespace micm void FreeConstData(LinearSolverParam& devstruct) { if (devstruct.nLij_Lii_ != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.nLij_Lii_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.nLij_Lii_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); if (devstruct.Lij_yj_ != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.Lij_yj_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + 
CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.Lij_yj_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); if (devstruct.nUij_Uii_ != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.nUij_Uii_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.nUij_Uii_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); if (devstruct.Uij_xj_ != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.Uij_xj_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.Uij_xj_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); } void SolveKernelDriver( @@ -133,7 +173,8 @@ namespace micm const LinearSolverParam& devstruct) { size_t number_of_blocks = (x_param.number_of_grid_cells_ + BLOCK_SIZE - 1) / BLOCK_SIZE; - SolveKernel<<>>(x_param, L_param, U_param, devstruct); + SolveKernel<<>>( + x_param, L_param, U_param, devstruct); } } // namespace cuda } // namespace micm \ No newline at end of file diff --git a/src/solver/lu_decomposition.cu b/src/solver/lu_decomposition.cu index 6ce66fab0..3941cd8c1 100644 --- a/src/solver/lu_decomposition.cu +++ b/src/solver/lu_decomposition.cu @@ -128,35 +128,128 @@ namespace micm /// Create a struct whose members contain the addresses in the device memory. LuDecomposeParam devstruct; - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.niLU_), niLU_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.do_aik_), do_aik_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.aik_), aik_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.uik_nkj_), uik_nkj_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.lij_ujk_), lij_ujk_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.do_aki_), do_aki_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.aki_), aki_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.lki_nkj_), lki_nkj_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.lkj_uji_), lkj_uji_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.uii_), uii_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); - CHECK_CUDA_ERROR(cudaMallocAsync(&devstruct.is_singular, sizeof(bool), micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync(&(devstruct.niLU_), niLU_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync( + &(devstruct.do_aik_), do_aik_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync(&(devstruct.aik_), aik_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + 
"cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync( + &(devstruct.uik_nkj_), uik_nkj_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync( + &(devstruct.lij_ujk_), lij_ujk_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync( + &(devstruct.do_aki_), do_aki_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync(&(devstruct.aki_), aki_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync( + &(devstruct.lki_nkj_), lki_nkj_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync( + &(devstruct.lkj_uji_), lkj_uji_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync(&(devstruct.uii_), uii_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync( + &devstruct.is_singular, sizeof(bool), micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); /// Copy the data from host to device - CHECK_CUDA_ERROR(cudaMemcpyAsync(devstruct.niLU_, hoststruct.niLU_, niLU_bytes, cudaMemcpyHostToDevice, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemcpy"); CHECK_CUDA_ERROR( - cudaMemcpyAsync(devstruct.do_aik_, hoststruct.do_aik_, do_aik_bytes, cudaMemcpyHostToDevice, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemcpy"); - CHECK_CUDA_ERROR(cudaMemcpyAsync(devstruct.aik_, hoststruct.aik_, aik_bytes, cudaMemcpyHostToDevice, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemcpy"); + cudaMemcpyAsync( + devstruct.niLU_, + hoststruct.niLU_, + niLU_bytes, + cudaMemcpyHostToDevice, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMemcpy"); + CHECK_CUDA_ERROR( + cudaMemcpyAsync( + devstruct.do_aik_, + hoststruct.do_aik_, + do_aik_bytes, + cudaMemcpyHostToDevice, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMemcpy"); + CHECK_CUDA_ERROR( + cudaMemcpyAsync( + devstruct.aik_, + hoststruct.aik_, + aik_bytes, + cudaMemcpyHostToDevice, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMemcpy"); + CHECK_CUDA_ERROR( + cudaMemcpyAsync( + devstruct.uik_nkj_, + hoststruct.uik_nkj_, + uik_nkj_bytes, + cudaMemcpyHostToDevice, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMemcpy"); + CHECK_CUDA_ERROR( + cudaMemcpyAsync( + devstruct.lij_ujk_, + hoststruct.lij_ujk_, + lij_ujk_bytes, + cudaMemcpyHostToDevice, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMemcpy"); CHECK_CUDA_ERROR( - cudaMemcpyAsync(devstruct.uik_nkj_, hoststruct.uik_nkj_, uik_nkj_bytes, cudaMemcpyHostToDevice, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemcpy"); + cudaMemcpyAsync( + devstruct.do_aki_, + hoststruct.do_aki_, + do_aki_bytes, + cudaMemcpyHostToDevice, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMemcpy"); CHECK_CUDA_ERROR( - cudaMemcpyAsync(devstruct.lij_ujk_, hoststruct.lij_ujk_, lij_ujk_bytes, cudaMemcpyHostToDevice, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemcpy"); + cudaMemcpyAsync( + devstruct.aki_, + hoststruct.aki_, + 
aki_bytes, + cudaMemcpyHostToDevice, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMemcpy"); CHECK_CUDA_ERROR( - cudaMemcpyAsync(devstruct.do_aki_, hoststruct.do_aki_, do_aki_bytes, cudaMemcpyHostToDevice, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemcpy"); - CHECK_CUDA_ERROR(cudaMemcpyAsync(devstruct.aki_, hoststruct.aki_, aki_bytes, cudaMemcpyHostToDevice, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemcpy"); + cudaMemcpyAsync( + devstruct.lki_nkj_, + hoststruct.lki_nkj_, + lki_nkj_bytes, + cudaMemcpyHostToDevice, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMemcpy"); CHECK_CUDA_ERROR( - cudaMemcpyAsync(devstruct.lki_nkj_, hoststruct.lki_nkj_, lki_nkj_bytes, cudaMemcpyHostToDevice, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemcpy"); + cudaMemcpyAsync( + devstruct.lkj_uji_, + hoststruct.lkj_uji_, + lkj_uji_bytes, + cudaMemcpyHostToDevice, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMemcpy"); CHECK_CUDA_ERROR( - cudaMemcpyAsync(devstruct.lkj_uji_, hoststruct.lkj_uji_, lkj_uji_bytes, cudaMemcpyHostToDevice, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemcpy"); - CHECK_CUDA_ERROR(cudaMemcpyAsync(devstruct.uii_, hoststruct.uii_, uii_bytes, cudaMemcpyHostToDevice, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemcpy"); + cudaMemcpyAsync( + devstruct.uii_, + hoststruct.uii_, + uii_bytes, + cudaMemcpyHostToDevice, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMemcpy"); devstruct.niLU_size_ = hoststruct.niLU_size_; return devstruct; @@ -167,27 +260,39 @@ namespace micm void FreeConstData(LuDecomposeParam& devstruct) { if (devstruct.is_singular != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.is_singular, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.is_singular, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaFree"); if (devstruct.niLU_ != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.niLU_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.niLU_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); if (devstruct.do_aik_ != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.do_aik_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.do_aik_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); if (devstruct.aik_ != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.aik_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.aik_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); if (devstruct.uik_nkj_ != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.uik_nkj_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.uik_nkj_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); if (devstruct.lij_ujk_ != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.lij_ujk_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.lij_ujk_, 
micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); if (devstruct.do_aki_ != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.do_aki_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.do_aki_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); if (devstruct.aki_ != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.aki_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.aki_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); if (devstruct.lki_nkj_ != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.lki_nkj_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.lki_nkj_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); if (devstruct.lkj_uji_ != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.lkj_uji_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.lkj_uji_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); if (devstruct.uii_ != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.uii_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.uii_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); } void DecomposeKernelDriver( @@ -199,9 +304,15 @@ namespace micm { // Launch the CUDA kernel for LU decomposition size_t number_of_blocks = (A_param.number_of_grid_cells_ + BLOCK_SIZE - 1) / BLOCK_SIZE; - DecomposeKernel<<>>(A_param, L_param, U_param, devstruct); + DecomposeKernel<<>>( + A_param, L_param, U_param, devstruct); // Copy the boolean result from device back to host - cudaMemcpyAsync(&is_singular, devstruct.is_singular, sizeof(bool), cudaMemcpyDeviceToHost, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)); + cudaMemcpyAsync( + &is_singular, + devstruct.is_singular, + sizeof(bool), + cudaMemcpyDeviceToHost, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)); } // end of DecomposeKernelDriver } // end of namespace cuda } // end of namespace micm diff --git a/src/solver/rosenbrock.cu b/src/solver/rosenbrock.cu index c38308acb..59d59f46b 100644 --- a/src/solver/rosenbrock.cu +++ b/src/solver/rosenbrock.cu @@ -45,10 +45,26 @@ namespace micm /// Create a struct whose members contain the addresses in the device memory. 
CudaRosenbrockSolverParam devstruct; - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.errors_input_), errors_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.errors_output_), errors_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.jacobian_diagonal_elements_), jacobian_diagonal_elements_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); - CHECK_CUDA_ERROR(cudaMallocAsync(&(devstruct.absolute_tolerance_), tolerance_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync( + &(devstruct.errors_input_), errors_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync( + &(devstruct.errors_output_), errors_bytes, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync( + &(devstruct.jacobian_diagonal_elements_), + jacobian_diagonal_elements_bytes, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); + CHECK_CUDA_ERROR( + cudaMallocAsync( + &(devstruct.absolute_tolerance_), + tolerance_bytes, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaMalloc"); /// Copy the data from host to device CHECK_CUDA_ERROR( @@ -61,7 +77,12 @@ namespace micm "cudaMemcpy"); CHECK_CUDA_ERROR( - cudaMemcpyAsync(devstruct.absolute_tolerance_, hoststruct.absolute_tolerance_, tolerance_bytes, cudaMemcpyHostToDevice, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + cudaMemcpyAsync( + devstruct.absolute_tolerance_, + hoststruct.absolute_tolerance_, + tolerance_bytes, + cudaMemcpyHostToDevice, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemcpy"); devstruct.errors_size_ = hoststruct.errors_size_; @@ -76,13 +97,22 @@ namespace micm void FreeConstData(CudaRosenbrockSolverParam& devstruct) { if (devstruct.errors_input_ != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.errors_input_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.errors_input_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaFree"); if (devstruct.errors_output_ != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.errors_output_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.errors_output_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaFree"); if (devstruct.jacobian_diagonal_elements_ != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.jacobian_diagonal_elements_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync( + devstruct.jacobian_diagonal_elements_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaFree"); if (devstruct.absolute_tolerance_ != nullptr) - CHECK_CUDA_ERROR(cudaFreeAsync(devstruct.absolute_tolerance_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaFree"); + CHECK_CUDA_ERROR( + cudaFreeAsync(devstruct.absolute_tolerance_, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + "cudaFree"); } // Specific CUDA device function to do reduction within a warp @@ -246,7 +276,11 @@ namespace micm { size_t 
number_of_blocks = (devstruct.jacobian_diagonal_elements_size_ * jacobian_param.number_of_grid_cells_ + BLOCK_SIZE - 1) / BLOCK_SIZE; - AlphaMinusJacobianKernel<<>>(jacobian_param, alpha, devstruct); + AlphaMinusJacobianKernel<<< + number_of_blocks, + BLOCK_SIZE, + 0, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)>>>(jacobian_param, alpha, devstruct); } // Host code that will launch the NormalizedError CUDA kernel @@ -268,14 +302,23 @@ namespace micm } CHECK_CUDA_ERROR( cudaMemcpyAsync( - devstruct.errors_input_, errors_param.d_data_, sizeof(double) * number_of_elements, cudaMemcpyDeviceToDevice, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + devstruct.errors_input_, + errors_param.d_data_, + sizeof(double) * number_of_elements, + cudaMemcpyDeviceToDevice, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemcpy"); if (number_of_elements > 1000000) { // call cublas APIs size_t number_of_blocks = (number_of_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; - ScaledErrorKernel<<>>(y_old_param, y_new_param, ros_param, devstruct); + ScaledErrorKernel<<< + number_of_blocks, + BLOCK_SIZE, + 0, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)>>>( + y_old_param, y_new_param, ros_param, devstruct); // call cublas function to perform the norm: // https://docs.nvidia.com/cuda/cublas/index.html?highlight=dnrm2#cublas-t-nrm2 CHECK_CUBLAS_ERROR( @@ -293,7 +336,11 @@ namespace micm bool is_first_call = true; // Kernel call - NormalizedErrorKernel<<>>( + NormalizedErrorKernel<<< + number_of_blocks, + BLOCK_SIZE, + BLOCK_SIZE * sizeof(double), + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)>>>( y_old_param, y_new_param, ros_param, devstruct, number_of_elements, is_first_call); is_first_call = false; while (number_of_blocks > 1) @@ -303,17 +350,30 @@ namespace micm new_number_of_blocks = std::ceil(std::ceil(number_of_blocks * 1.0 / BLOCK_SIZE) / 2.0); if (new_number_of_blocks <= 1) { - NormalizedErrorKernel<<<1, BLOCK_SIZE, BLOCK_SIZE * sizeof(double), micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)>>>( + NormalizedErrorKernel<<< + 1, + BLOCK_SIZE, + BLOCK_SIZE * sizeof(double), + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)>>>( y_old_param, y_new_param, ros_param, devstruct, number_of_blocks, is_first_call); break; } - NormalizedErrorKernel<<>>( + NormalizedErrorKernel<<< + new_number_of_blocks, + BLOCK_SIZE, + BLOCK_SIZE * sizeof(double), + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)>>>( y_old_param, y_new_param, ros_param, devstruct, number_of_blocks, is_first_call); number_of_blocks = new_number_of_blocks; } CHECK_CUDA_ERROR( - cudaMemcpyAsync(&normalized_error, &devstruct.errors_output_[0], sizeof(double), cudaMemcpyDeviceToHost, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), + cudaMemcpyAsync( + &normalized_error, + &devstruct.errors_output_[0], + sizeof(double), + cudaMemcpyDeviceToHost, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)), "cudaMemcpy"); cudaStreamSynchronize(micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)); normalized_error = std::sqrt(normalized_error / number_of_elements); diff --git a/src/util/cuda_matrix.cu b/src/util/cuda_matrix.cu index 5fc3e3ad6..08a3946bf 100644 --- a/src/util/cuda_matrix.cu +++ b/src/util/cuda_matrix.cu @@ -17,7 +17,8 @@ namespace micm cudaError_t MallocVector(CudaMatrixParam& param, std::size_t number_of_elements) { param.number_of_elements_ = number_of_elements; - 
cudaError_t err = cudaMallocAsync(&(param.d_data_), sizeof(T) * number_of_elements, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)); + cudaError_t err = cudaMallocAsync( + &(param.d_data_), sizeof(T) * number_of_elements, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)); return err; } @@ -37,16 +38,24 @@ namespace micm template cudaError_t CopyToDevice(CudaMatrixParam& param, std::vector& h_data) { - cudaError_t err = - cudaMemcpyAsync(param.d_data_, h_data.data(), sizeof(T) * param.number_of_elements_, cudaMemcpyHostToDevice, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)); + cudaError_t err = cudaMemcpyAsync( + param.d_data_, + h_data.data(), + sizeof(T) * param.number_of_elements_, + cudaMemcpyHostToDevice, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)); return err; } template cudaError_t CopyToHost(CudaMatrixParam& param, std::vector& h_data) { - cudaError_t err = - cudaMemcpyAsync(h_data.data(), param.d_data_, sizeof(T) * param.number_of_elements_, cudaMemcpyDeviceToHost, micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)); + cudaError_t err = cudaMemcpyAsync( + h_data.data(), + param.d_data_, + sizeof(T) * param.number_of_elements_, + cudaMemcpyDeviceToHost, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)); cudaStreamSynchronize(micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)); return err; } @@ -77,7 +86,11 @@ namespace micm cudaError_t FillCudaMatrix(CudaMatrixParam& param, T val) { std::size_t number_of_blocks = (param.number_of_elements_ + BLOCK_SIZE - 1) / BLOCK_SIZE; - FillCudaMatrixKernel<<>>(param.d_data_, param.number_of_elements_, val); + FillCudaMatrixKernel<<< + number_of_blocks, + BLOCK_SIZE, + 0, + micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0)>>>(param.d_data_, param.number_of_elements_, val); cudaError_t err = cudaGetLastError(); return err; } diff --git a/src/util/cuda_util.cu b/src/util/cuda_util.cu index 62d5bcbe0..5d0d166cf 100644 --- a/src/util/cuda_util.cu +++ b/src/util/cuda_util.cu @@ -76,8 +76,8 @@ namespace micm CudaStreamSingleton& CudaStreamSingleton::GetInstance() { - static CudaStreamSingleton instance; - return instance; + static CudaStreamSingleton instance; + return instance; } // Create a CUDA stream and return a unique pointer to it @@ -86,18 +86,18 @@ namespace micm cudaStream_t* cuda_stream = new cudaStream_t; CHECK_CUDA_ERROR(cudaStreamCreate(cuda_stream), "CUDA stream initialization failed..."); return CudaStreamPtr(cuda_stream, CudaStreamDeleter()); - } + } // Get the CUDA stream given a stream ID cudaStream_t& CudaStreamSingleton::GetCudaStream(std::size_t stream_id) { - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); if (auto search = cuda_streams_map_.find(stream_id); search == cuda_streams_map_.end()) { cuda_streams_map_[stream_id] = std::move(CreateCudaStream()); } return *cuda_streams_map_[stream_id]; - } + } // Empty the map variable to clean up all CUDA streams void CudaStreamSingleton::CleanUp() diff --git a/test/unit/cuda/util/cuda_gtest_main.cpp b/test/unit/cuda/util/cuda_gtest_main.cpp index 06b473adf..546ae134f 100644 --- a/test/unit/cuda/util/cuda_gtest_main.cpp +++ b/test/unit/cuda/util/cuda_gtest_main.cpp @@ -1,9 +1,11 @@ -#include #include -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - int result = RUN_ALL_TESTS(); - micm::cuda::CudaStreamSingleton::GetInstance().CleanUp(); // Ensure cleanup - return result; +#include + +int main(int argc, char **argv) 
+{
+  ::testing::InitGoogleTest(&argc, argv);
+  int result = RUN_ALL_TESTS();
+  micm::cuda::CudaStreamSingleton::GetInstance().CleanUp();  // Ensure cleanup
+  return result;
 }
\ No newline at end of file
diff --git a/test/unit/cuda/util/test_cuda_dense_matrix.cpp b/test/unit/cuda/util/test_cuda_dense_matrix.cpp
index 4f1eb105e..8bac62627 100644
--- a/test/unit/cuda/util/test_cuda_dense_matrix.cpp
+++ b/test/unit/cuda/util/test_cuda_dense_matrix.cpp
@@ -29,7 +29,7 @@ TEST(CudaDenseMatrix, DeviceMemCopy)
   micm::cuda::SquareDriver(param);
   micm::cuda::CopyToHost(param, h_vector);
   micm::cuda::FreeVector(param);
- 
+
   EXPECT_EQ(h_vector[0], 1 * 1);
   EXPECT_EQ(h_vector[1], 2 * 2);
   EXPECT_EQ(h_vector[2], 3 * 3);
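For reference, every hunk above reflows the same shape of call: an asynchronous CUDA operation issued on stream 0 of the `CudaStreamSingleton`, wrapped in `CHECK_CUDA_ERROR` so the file, line, and a short label are reported on failure. A minimal sketch of that pattern in isolation is below; it is illustrative only, not a function from the files in this patch. The buffer names and sizes are made up, and it assumes a `.cu` translation unit built with nvcc where the header is reachable as `micm/cuda/util/cuda_util.cuh` (the path shown in the diff above).

```cpp
#include <micm/cuda/util/cuda_util.cuh>  // CHECK_CUDA_ERROR, micm::cuda::CudaStreamSingleton

#include <cuda_runtime.h>

#include <cstddef>
#include <vector>

// Illustrative sketch: allocate, fill, and release a device buffer on the
// singleton's stream 0, checking every asynchronous call.
void RoundTripExample(std::size_t n)
{
  cudaStream_t& stream = micm::cuda::CudaStreamSingleton::GetInstance().GetCudaStream(0);

  double* d_data = nullptr;
  CHECK_CUDA_ERROR(cudaMallocAsync(&d_data, sizeof(double) * n, stream), "cudaMalloc");

  std::vector<double> h_data(n, 1.0);
  CHECK_CUDA_ERROR(
      cudaMemcpyAsync(d_data, h_data.data(), sizeof(double) * n, cudaMemcpyHostToDevice, stream),
      "cudaMemcpy");

  CHECK_CUDA_ERROR(cudaFreeAsync(d_data, stream), "cudaFree");

  // The calls above are asynchronous, so block on the stream before the host
  // reuses or inspects anything that depends on them.
  cudaStreamSynchronize(stream);
}
```

Because the calls are asynchronous, host code must synchronize (or issue a blocking copy) before reading results, which is why the patch keeps `cudaStreamSynchronize` in `CopyToHost` and after the device-to-host error copies.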