Decompose cuda (#257)

* work in progress * work in progress * test first step * typo * changed template * changed template * debug * debug * added matrixParam to AddForcingTermsKernelDriver * changed header * debug * debug * debug * test * passing vector_matrix directly * debug * implemented simple class * add std::vector * debug * ahhh * passing 3 matrix data as a group * deleted const * debug * debug * debug * debug * debug * added matrix data in a group for AddJacobianFunction * debug * store d_rate_constants in struct pointer * using cudaMallocManaged * using cudaMallocManaged * using cudaMallocManaged * device strcut * get all pointers into struct for AddJacobian * debug * debug * passed? * passing struct of processSet to kernelDriver * clean up * removed header * group processSet data members into struct * minor fix * minor fix * get rid of micm from struct * debug * debug * minor fix * minor fix * minor fix * clean up * debug * test * clean up * clean up * clean up * clean up * test * test out for storing pointer to variable * store reused device pointer to variable in kernel * store reused device pointer to variable in kernel * work in progress - 8/31 * added struct for sparseMatrix * added struct for sparseMatrix * sparse matrix in struct * compiling test * compiling test * test run for decompose * test run for decompose function * edited cmake * minor fix * test * test for passing std::vector to .cu file * testing for thrust * moved all vectors * test * cleanup * minor fix * cleanup * added header file for vector * minor fix * debug * chnage ngrids in test * checking on cpu code * bug * trying for smaller thrust vector * vector reallocation failed? * small bug * test memory allocation * clean up vectors * added const * debug * no more vector * added const * removed header * added cmake * edited header file * debug * debug * change dot operator * testing * test * small bug * test * testing * testing cpu code * changed kernel config * changed kernel config * changed kernel config * checking pair reference in GPU * debug * debug * bug * transfer data from device to data * transfer data * get rid of i++ * test * bug * bug * bug * inside the kernel * debug with print statement * debug print * modified test * print * print statement * print * test * debug * debug * printing aik * debug * get rid of ++ inside [] * debug * changed * changed * changed * test * test * print test * testing for one grid * testing for one grid * print A sparseMatrix * print * print A outside of kernel * printing A with pointer * printing A with pointer * print A * check * print lf * print lf * test * added synchronization * try to print with hd * test * check * check * test * print * test * checking niLU second * test * check * test * debug * test * test * added more print * added more print * test * test * more debug * more print * test * more print * more print * more print * more print * more print * more print * more print * more print * test * more print * test * more print * more print * more print * more print * more print * test * test * test * print * print * print * pass? cleanup * changed to expect_near to expect_eq * changed back to expect_near * used constructor from base class * eliminated dervied class constructor * added underscore to all struct members * added more underscore * added more underscore * added more underscore * more modification * added time measurement * GPU test against CPU * added time measurement * added time measurement * vector * addSparseMatrix template * test * typo * added std to vector * minor change * incremental change made to processSet * modified cuda_process_set.cuh * changed to device struct
NCAR · Oct 4, 2023 · 63ace0f · 63ace0f
1 parent 2b3be9c
commit 63ace0f
Show file tree

Hide file tree

Showing 13 changed files with 636 additions and 196 deletions.
diff --git a/include/micm/process/cuda_process_set.cuh b/include/micm/process/cuda_process_set.cuh
@@ -1,17 +1,21 @@
+// Copyright (C) 2023 National Center for Atmospheric Research,
+//
+// SPDX-License-Identifier: Apache-2.0
 #pragma once
 #include <micm/util/cuda_param.hpp>
+#include <micm/util/cuda_param.hpp>
 
 namespace micm
 {
   namespace cuda
   {
     std::chrono::nanoseconds AddForcingTermsKernelDriver(
-        CUDAMatrixParam& matrixParam,
-        CUDAProcessSetParam& processSet); 
+        CudaMatrixParam& matrixParam,
+        CudaProcessSetParam& processSet); 
 
     std::chrono::nanoseconds AddJacobianTermsKernelDriver(
-        CUDAMatrixParam& matrixParam, 
-        CUDASparseMatrixParam& sparseMatrix, 
-        CUDAProcessSetParam& processSet);
+        CudaMatrixParam& matrixParam, 
+        CudaSparseMatrixParam& sparseMatrix, 
+        CudaProcessSetParam& processSet);
   }  // namespace cuda
 }  // namespace micm
diff --git a/include/micm/process/cuda_process_set.hpp b/include/micm/process/cuda_process_set.hpp
@@ -5,6 +5,7 @@
 
 #include <micm/process/process_set.hpp>
 #include <micm/util/cuda_param.hpp>
+#include <micm/util/cuda_param.hpp>
 
 #ifdef USE_CUDA
 #  include <micm/process/cuda_process_set.cuh>
@@ -52,25 +53,27 @@ namespace micm
       const MatrixPolicy<double>& state_variables,
       MatrixPolicy<double>& forcing) const
   {
-    CUDAMatrixParam matrixParam;
-    matrixParam.rate_constants = rate_constants.AsVector().data();
-    matrixParam.state_variables = state_variables.AsVector().data();
-    matrixParam.forcing = forcing.AsVector().data();
-    matrixParam.n_grids = rate_constants.size();
-    matrixParam.n_reactions = rate_constants[0].size();
-    matrixParam.n_species = state_variables[0].size();
+    CudaMatrixParam matrix;
+    matrix.rate_constants_ = rate_constants.AsVector().data(); 
+    matrix.state_variables_ = state_variables.AsVector().data(); 
+    matrix.forcing_ = forcing.AsVector().data(); 
+    matrix.n_grids_ = rate_constants.size(); 
+    matrix.n_reactions_ = rate_constants[0].size(); 
+    matrix.n_species_ = state_variables[0].size();
 
-    CUDAProcessSetParam processSet;
-    processSet.number_of_reactants = number_of_reactants_.data();
-    processSet.reactant_ids = reactant_ids_.data();
-    processSet.reactant_ids_size = reactant_ids_.size();
-    processSet.number_of_products = number_of_products_.data();
-    processSet.product_ids = product_ids_.data();
-    processSet.product_ids_size = product_ids_.size();
-    processSet.yields = yields_.data();
-    processSet.yields_size = yields_.size();
+    CudaProcessSetParam processSet; 
+    processSet.number_of_reactants_ = number_of_reactants_.data(); 
+    processSet.reactant_ids_ = reactant_ids_.data(); 
+    processSet.reactant_ids_size_ = reactant_ids_.size(); 
+    processSet.number_of_products_ = number_of_products_.data(); 
+    processSet.product_ids_ = product_ids_.data(); 
+    processSet.product_ids_size_ = product_ids_.size(); 
+    processSet.yields_ = yields_.data(); 
+    processSet.yields_size_ = yields_.size(); 
 
-    std::chrono::nanoseconds kernel_duration = micm::cuda::AddForcingTermsKernelDriver(matrixParam, processSet);
+    std::chrono::nanoseconds kernel_duration = micm::cuda::AddForcingTermsKernelDriver(
+        matrix,
+        processSet);
     return kernel_duration;  // time performance of kernel function
   }
   template<template<class> class MatrixPolicy, template<class> class SparseMatrixPolicy>
@@ -80,29 +83,31 @@ namespace micm
       const MatrixPolicy<double>& state_variables,
       SparseMatrixPolicy<double>& jacobian) const
   {
-    CUDAMatrixParam matrixParam;
-    matrixParam.rate_constants = rate_constants.AsVector().data();
-    matrixParam.state_variables = state_variables.AsVector().data();
-    matrixParam.n_grids = rate_constants.size();
-    matrixParam.n_reactions = rate_constants[0].size();
-    matrixParam.n_species = state_variables[0].size();
-
-    CUDASparseMatrixParam sparseMatrix;
-    sparseMatrix.jacobian = jacobian.AsVector().data();
-    sparseMatrix.jacobian_size = jacobian.AsVector().size();
-
-    CUDAProcessSetParam processSet;
-    processSet.number_of_reactants = number_of_reactants_.data();
-    processSet.reactant_ids = reactant_ids_.data();
-    processSet.reactant_ids_size = reactant_ids_.size();
-    processSet.number_of_products = number_of_products_.data();
-    processSet.yields = yields_.data();
-    processSet.yields_size = yields_.size();
-    processSet.jacobian_flat_ids = jacobian_flat_ids_.data();
-    processSet.jacobian_flat_ids_size = jacobian_flat_ids_.size();
-
-    std::chrono::nanoseconds kernel_duration =
-        micm::cuda::AddJacobianTermsKernelDriver(matrixParam, sparseMatrix, processSet);
+    CudaMatrixParam matrix; 
+    matrix.rate_constants_ = rate_constants.AsVector().data(); 
+    matrix.state_variables_ = state_variables.AsVector().data(); 
+    matrix.n_grids_ = rate_constants.size(); 
+    matrix.n_reactions_ = rate_constants[0].size(); 
+    matrix.n_species_ = state_variables[0].size(); 
+
+    CudaSparseMatrixParam sparseMatrix; 
+    sparseMatrix.jacobian_ = jacobian.AsVector().data(); 
+    sparseMatrix.jacobian_size_ = jacobian.AsVector().size(); 
+
+    CudaProcessSetParam processSet; 
+    processSet.number_of_reactants_ = number_of_reactants_.data(); 
+    processSet.reactant_ids_ = reactant_ids_.data(); 
+    processSet.reactant_ids_size_ = reactant_ids_.size(); 
+    processSet.number_of_products_ = number_of_products_.data(); 
+    processSet.yields_ = yields_.data(); 
+    processSet.yields_size_ = yields_.size(); 
+    processSet.jacobian_flat_ids_ = jacobian_flat_ids_.data(); 
+    processSet.jacobian_flat_ids_size_ = jacobian_flat_ids_.size(); 
+
+    std::chrono::nanoseconds kernel_duration = micm::cuda::AddJacobianTermsKernelDriver(
+        matrix, 
+        sparseMatrix, 
+        processSet);
     return kernel_duration;  // time performance of kernel function
   }
 }  // namespace micm

diff --git a/include/micm/solver/cuda_lu_decomposition.cuh b/include/micm/solver/cuda_lu_decomposition.cuh
@@ -0,0 +1,15 @@
+// Copyright (C) 2023 National Center for Atmospheric Research,
+//
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+#include <micm/util/cuda_param.hpp>
+#include <vector>
+namespace micm
+{
+  namespace cuda
+  {
+    std::chrono::nanoseconds DecomposeKernelDriver(
+            CudaSparseMatrixParam& sparseMatrix, 
+            CudaSolverParam& solver);
+  }  // namespace cuda
+}  // namespace micm
diff --git a/include/micm/solver/cuda_lu_decomposition.hpp b/include/micm/solver/cuda_lu_decomposition.hpp
@@ -0,0 +1,70 @@
+// Copyright (C) 2023 National Center for Atmospheric Research,
+//
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+#include<micm/solver/lu_decomposition.hpp>
+#include<micm/util/cuda_param.hpp>
+#include <stdexcept>
+#include <chrono>
+#ifdef USE_CUDA
+#include <micm/solver/cuda_lu_decomposition.cuh>
+#endif 
+
+#ifdef USE_CUDA
+namespace micm{
+    class CudaLuDecomposition: public LuDecomposition{
+    public: 
+    CudaLuDecomposition(){}; 
+
+    template<typename T, template<class> typename SparseMatrixPolicy>
+    requires VectorizableSparse<SparseMatrixPolicy<T>>
+    std::chrono::nanoseconds Decompose(
+        const SparseMatrixPolicy<T>&A, 
+        SparseMatrixPolicy<T>& L, 
+        SparseMatrixPolicy<T>& U) const; 
+    }; 
+
+    template<typename T, template<class> class SparseMatrixPolicy>
+    requires(VectorizableSparse<SparseMatrixPolicy<T>>) 
+    std::chrono::nanoseconds CudaLuDecomposition::Decompose(
+        const SparseMatrixPolicy<T>& A, 
+        SparseMatrixPolicy<T>& L, 
+        SparseMatrixPolicy<T>& U) const
+    {
+        CudaSparseMatrixParam sparseMatrix; 
+        sparseMatrix.A_ = A.AsVector().data(); 
+        sparseMatrix.A_size_ = A.AsVector().size(); 
+        sparseMatrix.L_ = L.AsVector().data(); 
+        sparseMatrix.L_size_ = L.AsVector().size(); 
+        sparseMatrix.U_ = U.AsVector().data(); 
+        sparseMatrix.U_size_ = U.AsVector().size(); 
+        sparseMatrix.n_grids_ = A.size(); 
+
+        CudaSolverParam solver;    
+        solver.do_aik_ = do_aik_.data(); 
+        solver.do_aik_size_ = do_aik_.size(); 
+        solver.aik_ = aik_.data(); 
+        solver.aik_size_ = aik_.size(); 
+        solver.do_aki_ = do_aki_.data(); 
+        solver.do_aki_size_ = do_aki_.size(); 
+        solver.aki_ = aki_.data(); 
+        solver.aki_size_ = aki_.size(); 
+        solver.uii_ = uii_.data(); 
+        solver.uii_size_ = uii_.size(); 
+
+        solver.niLU_ = niLU_.data(); 
+        solver.niLU_size_ = niLU_.size(); 
+        solver.uik_nkj_ = uik_nkj_.data(); 
+        solver.uik_nkj_size_ = uik_nkj_.size(); 
+        solver.lij_ujk_ = lij_ujk_.data(); 
+        solver.lij_ujk_size_ = lij_ujk_.size(); 
+        solver.lki_nkj_ = lki_nkj_.data(); 
+        solver.lki_nkj_size_ = lki_nkj_.size(); 
+        solver.lkj_uji_ =  lkj_uji_.data(); 
+        solver.lkj_uji_size_ = lkj_uji_.size(); 
+
+        //calling kernelSetup function
+        return micm::cuda::DecomposeKernelDriver(sparseMatrix, solver); 
+    }
+}//end micm
+#endif
diff --git a/include/micm/solver/lu_decomposition.hpp b/include/micm/solver/lu_decomposition.hpp
@@ -2,7 +2,6 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
-
 #include <micm/util/sparse_matrix.hpp>
 
 namespace micm
@@ -39,8 +38,9 @@ namespace micm
     /// for each iteration of the outer (i) loop
     std::vector<std::pair<std::size_t, std::size_t>> niLU_;
     /// True when A[i][k] is non-zero for each iteration of the middle (k) loop for the upper
-    /// triangular matrix; False otherwise
-    std::vector<bool> do_aik_;
+    /// triangular matrix; False otherwise. Used data type char instead of bool because vector<bool> representation
+    ///does not suppor easy retrieval of memory address using data() function.
+    std::vector<char> do_aik_;
     /// Index in A.data_ for A[i][k] for each iteration of the middle (k) loop for the upper
     /// triangular matrix when A[i][k] is non-zero
     std::vector<std::size_t> aik_;
@@ -52,10 +52,11 @@ namespace micm
     /// when L[i][j] and U[j][k] are both non-zero.
     std::vector<std::pair<std::size_t, std::size_t>> lij_ujk_;
     /// True when A[k][i] is non-zero for each iteration of the middle (k) loop for the lower
-    /// triangular matrix; False otherwise
-    std::vector<bool> do_aki_;
+    /// triangular matrix; False otherwise. Used data type char instead of bool because vector<bool> representation
+    ///does not suppor easy retrieval of memory address using data() function.
+    std::vector<char> do_aki_;
     /// Index in A.data_ for A[k][i] for each iteration of the middle (k) loop for the lower
-    /// triangular matrix when A[k][i] is non-zero
+    /// triangular matrix when A[k][i] is non-zero. 
     std::vector<std::size_t> aki_;
     /// Index in L.data_ for L[k][i] for each iteration of the middle (k) loop for the lower
     /// triangular matrix when L[k][i] is non-zero, and the corresponding number of elements

diff --git a/include/micm/util/cuda_param.hpp b/include/micm/util/cuda_param.hpp
@@ -1,33 +1,63 @@
+#include<vector>
+
 #ifndef CUDA_PARAM_HPP
 #define CUDA_PARAM_HPP
-// member data of class CUDAProcessSet grouped in struct passing to kernel driver function
-struct CUDAProcessSetParam
-{
-  const size_t* number_of_reactants;
-  const size_t* reactant_ids;
-  size_t reactant_ids_size;
-  const size_t* number_of_products;
-  const size_t* product_ids;
-  size_t product_ids_size;
-  const double* yields;
-  size_t yields_size;
-  const size_t* jacobian_flat_ids;
-  size_t jacobian_flat_ids_size;
-};
-// different matrix data grouped in struct passing to kernel driver function
-struct CUDAMatrixParam
-{
-  const double* rate_constants;
-  const double* state_variables;
-  double* forcing;
-  size_t n_grids;
-  size_t n_reactions;
-  size_t n_species;
-};
-// sparseMatrix data grouped in struct passing to kernel driver function
-struct CUDASparseMatrixParam
-{
-  double* jacobian;
-  size_t jacobian_size;
+  //member data of class CudaProcessSet grouped in struct passing to kernel driver function 
+  const size_t BLOCK_SIZE = 320;
+  struct CudaProcessSetParam{
+    const size_t* number_of_reactants_; 
+    const size_t* reactant_ids_; 
+    size_t reactant_ids_size_; 
+    const size_t* number_of_products_; 
+    const size_t* product_ids_; 
+    size_t product_ids_size_; 
+    const double* yields_; 
+    size_t yields_size_; 
+    const size_t* jacobian_flat_ids_; 
+    size_t jacobian_flat_ids_size_; 
+}; 
+
+  struct CudaSolverParam{
+    const std::pair<size_t, size_t>* niLU_; 
+    size_t niLU_size_;
+    const char* do_aik_;
+    size_t do_aik_size_; 
+    const size_t* aik_;
+    size_t aik_size_;  
+    const std::pair<size_t, size_t>* uik_nkj_; 
+    size_t uik_nkj_size_; 
+    const std::pair<size_t, size_t>* lij_ujk_; 
+    size_t lij_ujk_size_; 
+    const char* do_aki_;
+    size_t do_aki_size_; 
+    const size_t* aki_; 
+    size_t aki_size_;
+    const std::pair<size_t, size_t>* lki_nkj_;
+    size_t lki_nkj_size_; 
+    const std::pair<size_t, size_t>* lkj_uji_;  
+    size_t lkj_uji_size_;  
+    const size_t* uii_; 
+    size_t uii_size_; 
+  };
+ //different matrix data grouped in struct passing to kernel driver function 
+    struct CudaMatrixParam{
+     const double* rate_constants_; 
+     const double* state_variables_; 
+     double* forcing_; 
+     size_t n_grids_; 
+     size_t n_reactions_; 
+     size_t n_species_; 
+}; 
+//sparseMatrix data grouped in struct passing to kernel driver function 
+struct CudaSparseMatrixParam{
+   double* jacobian_; 
+   size_t jacobian_size_; 
+   const double* A_; 
+   size_t A_size_; 
+   double* L_; 
+   size_t L_size_; 
+   double* U_;
+   size_t U_size_; 
+   size_t n_grids_;
 };
 #endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -86,4 +86,5 @@ if(ENABLE_CUDA)
   )
 endif()
 
-add_subdirectory(process)
+add_subdirectory(process)
+add_subdirectory(solver)