Skip to content

Commit

Permalink
Decompose cuda (#257)
Browse files Browse the repository at this point in the history
* work in progress

* work in progress

* test first step

* typo

* changed template

* changed template

* debug

* debug

* added matrixParam to AddForcingTermsKernelDriver

* changed header

* debug

* debug

* debug

* test

* passing vector_matrix directly

* debug

* implemented simple class

* add std::vector

* debug

* ahhh

* passing 3 matrix data as a group

* deleted const

* debug

* debug

* debug

* debug

* debug

* added matrix data in a group for AddJacobianFunction

* debug

* store d_rate_constants in struct pointer

* using cudaMallocManaged

* using cudaMallocManaged

* using cudaMallocManaged

* device strcut

* get all pointers into struct for AddJacobian

* debug

* debug

* passed?

* passing struct of processSet to kernelDriver

* clean up

* removed header

* group processSet data members into struct

* minor fix

* minor fix

* get rid of micm from struct

* debug

* debug

* minor fix

* minor fix

* minor fix

* clean up

* debug

* test

* clean up

* clean up

* clean up

* clean up

* test

* test out for storing pointer to variable

* store reused device pointer to variable in kernel

* store reused device pointer to variable in kernel

* work in progress - 8/31

* added struct for sparseMatrix

* added struct for sparseMatrix

* sparse matrix in struct

* compiling test

* compiling test

* test run for decompose

* test run for decompose function

* edited cmake

* minor fix

* test

* test for passing std::vector to .cu file

* testing for thrust

* moved all vectors

* test

* cleanup

* minor fix

* cleanup

* added header file for vector

* minor fix

* debug

* chnage ngrids in test

* checking on cpu code

* bug

* trying for smaller thrust vector

* vector reallocation failed?

* small bug

* test memory allocation

* clean up vectors

* added const

* debug

* no more vector

* added const

* removed header

* added cmake

* edited header file

* debug

* debug

* change dot operator

* testing

* test

* small bug

* test

* testing

* testing cpu code

* changed kernel config

* changed kernel config

* changed kernel config

* checking pair reference in GPU

* debug

* debug

* bug

* transfer data from device to data

* transfer data

* get rid of i++

* test

* bug

* bug

* bug

* inside the kernel

* debug with print statement

* debug print

* modified test

* print

* print statement

* print

* test

* debug

* debug

* printing aik

* debug

* get rid of ++ inside []

* debug

* changed

* changed

* changed

* test

* test

* print test

* testing for one grid

* testing for one grid

* print A sparseMatrix

* print

* print A outside of kernel

* printing A with pointer

* printing A with pointer

* print A

* check

* print lf

* print lf

* test

* added synchronization

* try to print with hd

* test

* check

* check

* test

* print

* test

* checking niLU second

* test

* check

* test

* debug

* test

* test

* added more print

* added more print

* test

* test

* more debug

* more print

* test

* more print

* more print

* more print

* more print

* more print

* more print

* more print

* more print

* test

* more print

* test

* more print

* more print

* more print

* more print

* more print

* test

* test

* test

* print

* print

* print

* pass? cleanup

* changed to expect_near to expect_eq

* changed back to expect_near

* used constructor from base class

* eliminated dervied class constructor

* added underscore to all struct members

* added more underscore

* added more underscore

* added more underscore

* more modification

* added time measurement

* GPU test against CPU

* added time measurement

* added time measurement

* vector

* addSparseMatrix template

* test

* typo

* added std to vector

* minor change

* incremental change made to processSet

* modified cuda_process_set.cuh

* changed to device struct
  • Loading branch information
qinatan authored Oct 4, 2023
1 parent 2b3be9c commit 63ace0f
Show file tree
Hide file tree
Showing 13 changed files with 636 additions and 196 deletions.
14 changes: 9 additions & 5 deletions include/micm/process/cuda_process_set.cuh
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
// Copyright (C) 2023 National Center for Atmospheric Research,
//
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include <micm/util/cuda_param.hpp>
#include <micm/util/cuda_param.hpp>

namespace micm
{
namespace cuda
{
std::chrono::nanoseconds AddForcingTermsKernelDriver(
CUDAMatrixParam& matrixParam,
CUDAProcessSetParam& processSet);
CudaMatrixParam& matrixParam,
CudaProcessSetParam& processSet);

std::chrono::nanoseconds AddJacobianTermsKernelDriver(
CUDAMatrixParam& matrixParam,
CUDASparseMatrixParam& sparseMatrix,
CUDAProcessSetParam& processSet);
CudaMatrixParam& matrixParam,
CudaSparseMatrixParam& sparseMatrix,
CudaProcessSetParam& processSet);
} // namespace cuda
} // namespace micm
85 changes: 45 additions & 40 deletions include/micm/process/cuda_process_set.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include <micm/process/process_set.hpp>
#include <micm/util/cuda_param.hpp>
#include <micm/util/cuda_param.hpp>

#ifdef USE_CUDA
# include <micm/process/cuda_process_set.cuh>
Expand Down Expand Up @@ -52,25 +53,27 @@ namespace micm
const MatrixPolicy<double>& state_variables,
MatrixPolicy<double>& forcing) const
{
CUDAMatrixParam matrixParam;
matrixParam.rate_constants = rate_constants.AsVector().data();
matrixParam.state_variables = state_variables.AsVector().data();
matrixParam.forcing = forcing.AsVector().data();
matrixParam.n_grids = rate_constants.size();
matrixParam.n_reactions = rate_constants[0].size();
matrixParam.n_species = state_variables[0].size();
CudaMatrixParam matrix;
matrix.rate_constants_ = rate_constants.AsVector().data();
matrix.state_variables_ = state_variables.AsVector().data();
matrix.forcing_ = forcing.AsVector().data();
matrix.n_grids_ = rate_constants.size();
matrix.n_reactions_ = rate_constants[0].size();
matrix.n_species_ = state_variables[0].size();

CUDAProcessSetParam processSet;
processSet.number_of_reactants = number_of_reactants_.data();
processSet.reactant_ids = reactant_ids_.data();
processSet.reactant_ids_size = reactant_ids_.size();
processSet.number_of_products = number_of_products_.data();
processSet.product_ids = product_ids_.data();
processSet.product_ids_size = product_ids_.size();
processSet.yields = yields_.data();
processSet.yields_size = yields_.size();
CudaProcessSetParam processSet;
processSet.number_of_reactants_ = number_of_reactants_.data();
processSet.reactant_ids_ = reactant_ids_.data();
processSet.reactant_ids_size_ = reactant_ids_.size();
processSet.number_of_products_ = number_of_products_.data();
processSet.product_ids_ = product_ids_.data();
processSet.product_ids_size_ = product_ids_.size();
processSet.yields_ = yields_.data();
processSet.yields_size_ = yields_.size();

std::chrono::nanoseconds kernel_duration = micm::cuda::AddForcingTermsKernelDriver(matrixParam, processSet);
std::chrono::nanoseconds kernel_duration = micm::cuda::AddForcingTermsKernelDriver(
matrix,
processSet);
return kernel_duration; // time performance of kernel function
}
template<template<class> class MatrixPolicy, template<class> class SparseMatrixPolicy>
Expand All @@ -80,29 +83,31 @@ namespace micm
const MatrixPolicy<double>& state_variables,
SparseMatrixPolicy<double>& jacobian) const
{
CUDAMatrixParam matrixParam;
matrixParam.rate_constants = rate_constants.AsVector().data();
matrixParam.state_variables = state_variables.AsVector().data();
matrixParam.n_grids = rate_constants.size();
matrixParam.n_reactions = rate_constants[0].size();
matrixParam.n_species = state_variables[0].size();

CUDASparseMatrixParam sparseMatrix;
sparseMatrix.jacobian = jacobian.AsVector().data();
sparseMatrix.jacobian_size = jacobian.AsVector().size();

CUDAProcessSetParam processSet;
processSet.number_of_reactants = number_of_reactants_.data();
processSet.reactant_ids = reactant_ids_.data();
processSet.reactant_ids_size = reactant_ids_.size();
processSet.number_of_products = number_of_products_.data();
processSet.yields = yields_.data();
processSet.yields_size = yields_.size();
processSet.jacobian_flat_ids = jacobian_flat_ids_.data();
processSet.jacobian_flat_ids_size = jacobian_flat_ids_.size();

std::chrono::nanoseconds kernel_duration =
micm::cuda::AddJacobianTermsKernelDriver(matrixParam, sparseMatrix, processSet);
CudaMatrixParam matrix;
matrix.rate_constants_ = rate_constants.AsVector().data();
matrix.state_variables_ = state_variables.AsVector().data();
matrix.n_grids_ = rate_constants.size();
matrix.n_reactions_ = rate_constants[0].size();
matrix.n_species_ = state_variables[0].size();

CudaSparseMatrixParam sparseMatrix;
sparseMatrix.jacobian_ = jacobian.AsVector().data();
sparseMatrix.jacobian_size_ = jacobian.AsVector().size();

CudaProcessSetParam processSet;
processSet.number_of_reactants_ = number_of_reactants_.data();
processSet.reactant_ids_ = reactant_ids_.data();
processSet.reactant_ids_size_ = reactant_ids_.size();
processSet.number_of_products_ = number_of_products_.data();
processSet.yields_ = yields_.data();
processSet.yields_size_ = yields_.size();
processSet.jacobian_flat_ids_ = jacobian_flat_ids_.data();
processSet.jacobian_flat_ids_size_ = jacobian_flat_ids_.size();

std::chrono::nanoseconds kernel_duration = micm::cuda::AddJacobianTermsKernelDriver(
matrix,
sparseMatrix,
processSet);
return kernel_duration; // time performance of kernel function
}
} // namespace micm
Expand Down
15 changes: 15 additions & 0 deletions include/micm/solver/cuda_lu_decomposition.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Copyright (C) 2023 National Center for Atmospheric Research,
//
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include <micm/util/cuda_param.hpp>
#include <vector>
namespace micm
{
namespace cuda
{
std::chrono::nanoseconds DecomposeKernelDriver(
CudaSparseMatrixParam& sparseMatrix,
CudaSolverParam& solver);
} // namespace cuda
} // namespace micm
70 changes: 70 additions & 0 deletions include/micm/solver/cuda_lu_decomposition.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
// Copyright (C) 2023 National Center for Atmospheric Research,
//
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include<micm/solver/lu_decomposition.hpp>
#include<micm/util/cuda_param.hpp>
#include <stdexcept>
#include <chrono>
#ifdef USE_CUDA
#include <micm/solver/cuda_lu_decomposition.cuh>
#endif

#ifdef USE_CUDA
namespace micm{
class CudaLuDecomposition: public LuDecomposition{
public:
CudaLuDecomposition(){};

template<typename T, template<class> typename SparseMatrixPolicy>
requires VectorizableSparse<SparseMatrixPolicy<T>>
std::chrono::nanoseconds Decompose(
const SparseMatrixPolicy<T>&A,
SparseMatrixPolicy<T>& L,
SparseMatrixPolicy<T>& U) const;
};

template<typename T, template<class> class SparseMatrixPolicy>
requires(VectorizableSparse<SparseMatrixPolicy<T>>)
std::chrono::nanoseconds CudaLuDecomposition::Decompose(
const SparseMatrixPolicy<T>& A,
SparseMatrixPolicy<T>& L,
SparseMatrixPolicy<T>& U) const
{
CudaSparseMatrixParam sparseMatrix;
sparseMatrix.A_ = A.AsVector().data();
sparseMatrix.A_size_ = A.AsVector().size();
sparseMatrix.L_ = L.AsVector().data();
sparseMatrix.L_size_ = L.AsVector().size();
sparseMatrix.U_ = U.AsVector().data();
sparseMatrix.U_size_ = U.AsVector().size();
sparseMatrix.n_grids_ = A.size();

CudaSolverParam solver;
solver.do_aik_ = do_aik_.data();
solver.do_aik_size_ = do_aik_.size();
solver.aik_ = aik_.data();
solver.aik_size_ = aik_.size();
solver.do_aki_ = do_aki_.data();
solver.do_aki_size_ = do_aki_.size();
solver.aki_ = aki_.data();
solver.aki_size_ = aki_.size();
solver.uii_ = uii_.data();
solver.uii_size_ = uii_.size();

solver.niLU_ = niLU_.data();
solver.niLU_size_ = niLU_.size();
solver.uik_nkj_ = uik_nkj_.data();
solver.uik_nkj_size_ = uik_nkj_.size();
solver.lij_ujk_ = lij_ujk_.data();
solver.lij_ujk_size_ = lij_ujk_.size();
solver.lki_nkj_ = lki_nkj_.data();
solver.lki_nkj_size_ = lki_nkj_.size();
solver.lkj_uji_ = lkj_uji_.data();
solver.lkj_uji_size_ = lkj_uji_.size();

//calling kernelSetup function
return micm::cuda::DecomposeKernelDriver(sparseMatrix, solver);
}
}//end micm
#endif
13 changes: 7 additions & 6 deletions include/micm/solver/lu_decomposition.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <micm/util/sparse_matrix.hpp>

namespace micm
Expand Down Expand Up @@ -39,8 +38,9 @@ namespace micm
/// for each iteration of the outer (i) loop
std::vector<std::pair<std::size_t, std::size_t>> niLU_;
/// True when A[i][k] is non-zero for each iteration of the middle (k) loop for the upper
/// triangular matrix; False otherwise
std::vector<bool> do_aik_;
/// triangular matrix; False otherwise. Used data type char instead of bool because vector<bool> representation
///does not suppor easy retrieval of memory address using data() function.
std::vector<char> do_aik_;
/// Index in A.data_ for A[i][k] for each iteration of the middle (k) loop for the upper
/// triangular matrix when A[i][k] is non-zero
std::vector<std::size_t> aik_;
Expand All @@ -52,10 +52,11 @@ namespace micm
/// when L[i][j] and U[j][k] are both non-zero.
std::vector<std::pair<std::size_t, std::size_t>> lij_ujk_;
/// True when A[k][i] is non-zero for each iteration of the middle (k) loop for the lower
/// triangular matrix; False otherwise
std::vector<bool> do_aki_;
/// triangular matrix; False otherwise. Used data type char instead of bool because vector<bool> representation
///does not suppor easy retrieval of memory address using data() function.
std::vector<char> do_aki_;
/// Index in A.data_ for A[k][i] for each iteration of the middle (k) loop for the lower
/// triangular matrix when A[k][i] is non-zero
/// triangular matrix when A[k][i] is non-zero.
std::vector<std::size_t> aki_;
/// Index in L.data_ for L[k][i] for each iteration of the middle (k) loop for the lower
/// triangular matrix when L[k][i] is non-zero, and the corresponding number of elements
Expand Down
88 changes: 59 additions & 29 deletions include/micm/util/cuda_param.hpp
Original file line number Diff line number Diff line change
@@ -1,33 +1,63 @@
#include<vector>

#ifndef CUDA_PARAM_HPP
#define CUDA_PARAM_HPP
// member data of class CUDAProcessSet grouped in struct passing to kernel driver function
struct CUDAProcessSetParam
{
const size_t* number_of_reactants;
const size_t* reactant_ids;
size_t reactant_ids_size;
const size_t* number_of_products;
const size_t* product_ids;
size_t product_ids_size;
const double* yields;
size_t yields_size;
const size_t* jacobian_flat_ids;
size_t jacobian_flat_ids_size;
};
// different matrix data grouped in struct passing to kernel driver function
struct CUDAMatrixParam
{
const double* rate_constants;
const double* state_variables;
double* forcing;
size_t n_grids;
size_t n_reactions;
size_t n_species;
};
// sparseMatrix data grouped in struct passing to kernel driver function
struct CUDASparseMatrixParam
{
double* jacobian;
size_t jacobian_size;
//member data of class CudaProcessSet grouped in struct passing to kernel driver function
const size_t BLOCK_SIZE = 320;
struct CudaProcessSetParam{
const size_t* number_of_reactants_;
const size_t* reactant_ids_;
size_t reactant_ids_size_;
const size_t* number_of_products_;
const size_t* product_ids_;
size_t product_ids_size_;
const double* yields_;
size_t yields_size_;
const size_t* jacobian_flat_ids_;
size_t jacobian_flat_ids_size_;
};

struct CudaSolverParam{
const std::pair<size_t, size_t>* niLU_;
size_t niLU_size_;
const char* do_aik_;
size_t do_aik_size_;
const size_t* aik_;
size_t aik_size_;
const std::pair<size_t, size_t>* uik_nkj_;
size_t uik_nkj_size_;
const std::pair<size_t, size_t>* lij_ujk_;
size_t lij_ujk_size_;
const char* do_aki_;
size_t do_aki_size_;
const size_t* aki_;
size_t aki_size_;
const std::pair<size_t, size_t>* lki_nkj_;
size_t lki_nkj_size_;
const std::pair<size_t, size_t>* lkj_uji_;
size_t lkj_uji_size_;
const size_t* uii_;
size_t uii_size_;
};
//different matrix data grouped in struct passing to kernel driver function
struct CudaMatrixParam{
const double* rate_constants_;
const double* state_variables_;
double* forcing_;
size_t n_grids_;
size_t n_reactions_;
size_t n_species_;
};
//sparseMatrix data grouped in struct passing to kernel driver function
struct CudaSparseMatrixParam{
double* jacobian_;
size_t jacobian_size_;
const double* A_;
size_t A_size_;
double* L_;
size_t L_size_;
double* U_;
size_t U_size_;
size_t n_grids_;
};
#endif
3 changes: 2 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -86,4 +86,5 @@ if(ENABLE_CUDA)
)
endif()

add_subdirectory(process)
add_subdirectory(process)
add_subdirectory(solver)
Loading

0 comments on commit 63ace0f

Please sign in to comment.