refactor gpuDILU update

multitalentloes committed Jan 17, 2025
1 parent 9cf162d commit 47726cc
Showing 3 changed files with 55 additions and 37 deletions.
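At a glance: the monolithic computeDiagAndMoveReorderedData() is split into reorderAndSplitMatrix() (reordering and splitting of the updated matrix data) and computeDiagonal() (the DILU factorization proper), the kernel wrappers gain an explicit trailing cudaStream_t parameter, and update() enables the previously commented-out CUDA-graph path, so the factorization kernels are captured once and replayed on every later update.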
59 changes: 37 additions & 22 deletions opm/simulators/linalg/gpuistl/GpuDILU.cpp
@@ -127,7 +127,8 @@ GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int
         }
     }
 
-    computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
+    reorderAndSplitMatrix(m_moveThreadBlockSize);
+    computeDiagonal(m_DILUFactorizationThreadBlockSize);
 
     if (m_tuneThreadBlockSizes) {
         tuneThreadBlockSizes();
@@ -191,7 +192,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                 m_gpuDInvFloat->data(),
                 d.data(),
                 v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream);
         } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
             detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float, field_type>(
                 m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
@@ -203,7 +205,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                 m_gpuDInv.data(),
                 d.data(),
                 v.data(),
-                lowerSolveThreadBlockSize);
+                lowerSolveThreadBlockSize,
+                m_stream);
         } else {
             detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, field_type, field_type>(
                 m_gpuMatrixReorderedLower->getNonZeroValues().data(),
@@ -251,7 +254,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                 numOfRowsInLevel,
                 m_gpuDInvFloat->data(),
                 v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream);
         } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){
             detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, float>(
                 m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
@@ -262,7 +266,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                 numOfRowsInLevel,
                 m_gpuDInv.data(),
                 v.data(),
-                upperSolveThreadBlockSize);
+                upperSolveThreadBlockSize,
+                m_stream);
         } else {
             detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, field_type>(
                 m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
@@ -315,19 +320,21 @@ GpuDILU<M, X, Y, l>::update()
 
     cudaDeviceSynchronize(); // only for timing
     CumulativeScopeTimer timer; // only for timing
-    update(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
-    // m_gpuMatrix.updateNonzeroValuesDirectlyInStream(m_cpuMatrix, m_stream); // send updated matrix to the gpu
-    // if (!m_update_graph_captured)
-    // {
-    //     m_update_graph = cudaGraph_t();
-    //     m_update_executable_graph = cudaGraphExec_t();
-    //     OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal));
-    //     computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
-    //     OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_update_graph));
-    //     OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_update_executable_graph, m_update_graph, nullptr, nullptr, 0));
-    //     m_update_graph_captured = true;
-    // }
-    // OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_update_executable_graph, 0));
+    m_gpuMatrix.updateNonzeroValuesDirectlyInStream(m_cpuMatrix, m_stream); // send updated matrix to the gpu
+    reorderAndSplitMatrix(m_moveThreadBlockSize);
+
+    if (!m_update_graph_captured)
+    {
+        m_update_graph = cudaGraph_t();
+        m_update_executable_graph = cudaGraphExec_t();
+        OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream, cudaStreamCaptureModeGlobal));
+        computeDiagonal(m_DILUFactorizationThreadBlockSize);
+        OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream, &m_update_graph));
+        OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_update_executable_graph, m_update_graph, nullptr, nullptr, 0));
+        m_update_graph_captured = true;
+    }
+
+    OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_update_executable_graph, 0));
 
     cudaDeviceSynchronize(); // only for timing
 }
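The capture-once/replay pattern that update() switches on above can be shown in isolation. A minimal, self-contained sketch (editorial, not OPM code), with a trivial stand-in kernel and a plain error-check macro where the commit uses OPM_GPU_SAFE_CALL:

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(call)                                                        \
    do {                                                                        \
        cudaError_t err_ = (call);                                              \
        if (err_ != cudaSuccess) {                                              \
            std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_)); \
            std::exit(EXIT_FAILURE);                                            \
        }                                                                       \
    } while (0)

// Stand-in for the DILU factorization kernels that end up inside the graph.
__global__ void factorizeLevel(float* dInv, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        dInv[i] = 1.0f / (dInv[i] + 2.0f);
    }
}

int main()
{
    const int n = 1 << 20;
    float* dInv = nullptr;
    CUDA_CHECK(cudaMalloc(&dInv, n * sizeof(float)));
    CUDA_CHECK(cudaMemset(dInv, 0, n * sizeof(float)));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // Capture the kernel sequence once; every launch must target the capturing stream.
    cudaGraph_t graph;
    cudaGraphExec_t graphExec;
    CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
    factorizeLevel<<<(n + 255) / 256, 256, 0, stream>>>(dInv, n);
    CUDA_CHECK(cudaStreamEndCapture(stream, &graph));
    CUDA_CHECK(cudaGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));

    // Replay the instantiated graph on later updates; only the data changes,
    // and the CPU-side launch overhead of the topology is paid once.
    for (int iter = 0; iter < 10; ++iter) {
        CUDA_CHECK(cudaGraphLaunch(graphExec, stream));
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaGraphExecDestroy(graphExec));
    CUDA_CHECK(cudaGraphDestroy(graph));
    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaFree(dInv));
    return 0;
}

Note how the commit keeps the matrix upload and reorderAndSplitMatrix() outside the captured region: only computeDiagonal() runs inside the graph, so refreshed matrix values flow through an unchanged launch topology.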

@@ -338,12 +345,13 @@ void
 GpuDILU<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationBlockSize)
 {
     m_gpuMatrix.updateNonzeroValuesDirectlyInStream(m_cpuMatrix, m_stream); // send updated matrix to the gpu
-    computeDiagAndMoveReorderedData(moveThreadBlockSize, factorizationBlockSize);
+    reorderAndSplitMatrix(moveThreadBlockSize);
+    computeDiagonal(factorizationBlockSize);
 }
 
 template <class M, class X, class Y, int l>
 void
-GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationBlockSize)
+GpuDILU<M, X, Y, l>::reorderAndSplitMatrix(int moveThreadBlockSize)
 {
     if (m_splitMatrix) {
         detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
@@ -369,7 +377,12 @@ GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in
             moveThreadBlockSize,
             m_stream);
     }
+}
 
+template <class M, class X, class Y, int l>
+void
+GpuDILU<M, X, Y, l>::computeDiagonal(int factorizationBlockSize)
+{
     int levelStartIdx = 0;
     for (int level = 0; level < m_levelSets.size(); ++level) {
         const int numOfRowsInLevel = m_levelSets[level].size();
@@ -391,7 +404,8 @@ GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in
                 m_gpuDInvFloat->data(),
                 m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
                 m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
-                factorizationBlockSize);
+                factorizationBlockSize,
+                m_stream);
         } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
             detail::DILU::computeDiluDiagonalSplit<blocksize_, field_type, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>(
                 m_gpuMatrixReorderedLower->getNonZeroValues().data(),
@@ -409,7 +423,8 @@ GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in
                 nullptr,
                 m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
                 m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
-                factorizationBlockSize);
+                factorizationBlockSize,
+                m_stream);
         } else {
             // TODO: should this be field type twice or field type then float in the template?
             detail::DILU::computeDiluDiagonalSplit<blocksize_, field_type, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>(
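A side note on the MatrixStorageMPScheme argument threaded through these calls: the factorization reads the double-precision matrix and writes the inverted diagonal and the reordered off-diagonal copies in the precisions named by the scheme. A schematic sketch (editorial, not the OPM definition) of what each variant stores:

#include <vector>

// The three schemes that appear in the diff above.
enum class MatrixStorageMPScheme {
    DOUBLE_DIAG_DOUBLE_OFFDIAG, // everything kept in double
    FLOAT_DIAG_FLOAT_OFFDIAG,   // inverted diagonal and off-diagonals in float
    DOUBLE_DIAG_FLOAT_OFFDIAG   // numerically sensitive diagonal stays double
};

// Hypothetical container illustrating which buffers each scheme populates.
struct FactorizationBuffers {
    std::vector<double> dInv;        // the DOUBLE_DIAG_* schemes
    std::vector<float> dInvFloat;    // FLOAT_DIAG_FLOAT_OFFDIAG
    std::vector<float> offDiagFloat; // the *_FLOAT_OFFDIAG schemes
};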
5 changes: 4 additions & 1 deletion opm/simulators/linalg/gpuistl/GpuDILU.hpp
@@ -83,8 +83,11 @@ class GpuDILU : public Dune::PreconditionerWithUpdate<X, Y>
     //! \brief Updates the matrix data.
     void update() final;
 
+    //! \brief perform matrix splitting and reordering
+    void reorderAndSplitMatrix(int moveThreadBlockSize);
+
     //! \brief Compute the diagonal of the DILU, and update the data of the reordered matrix
-    void computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationThreadBlockSize);
+    void computeDiagonal(int factorizationThreadBlockSize);
 
     //! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
     void tuneThreadBlockSizes();
@@ -317,7 +317,7 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
         reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
 }
 // perform the upper solve for all rows in the same level set
@@ -357,7 +357,7 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
         reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }
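All of the wrapper hunks in this file make the same change: the launch configuration grows from <<<blocks, threads>>> to <<<blocks, threads, 0, stream>>>. The third argument is dynamic shared memory (0 bytes here) and the fourth is the stream; work launched on the legacy default stream is not recorded by cudaStreamBeginCapture, so any kernel that may run under graph capture has to target the capturing stream explicitly. A minimal sketch with a hypothetical kernel:

#include <cuda_runtime.h>

// Hypothetical kernel standing in for the cuSolve*/cuCompute* kernels above.
__global__ void scaleVector(double* v, double alpha, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        v[i] *= alpha;
    }
}

// Wrapper mirroring the pattern in the diff: compute the grid size, then
// launch with 0 bytes of dynamic shared memory on the caller's stream.
void scaleOnStream(double* v, double alpha, int n, int thrBlockSize, cudaStream_t stream)
{
    int nThreadBlocks = (n + thrBlockSize - 1) / thrBlockSize;
    scaleVector<<<nThreadBlocks, thrBlockSize, 0, stream>>>(v, alpha, n);
}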

@@ -415,7 +415,7 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
     int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
         cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme>, thrBlockSize);
     int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme><<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
+    cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme><<<nThreadBlocks, threadBlockSize, 0, stream>>>(srcReorderedLowerMat,
         lowerRowIndices,
         lowerColIndices,
         srcReorderedUpperMat,
@@ -437,21 +437,21 @@
 
 // TODO: format
 #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
-    template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*, int); \
+    template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*, int, cudaStream_t); \
     template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
+        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int, cudaStream_t); \
     template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
+        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int, cudaStream_t); \
     template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
+        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int, cudaStream_t); \
     template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
+        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int, cudaStream_t); \
     template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
+        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int, cudaStream_t); \
     template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
-        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
-    template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
-    template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);
+        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int, cudaStream_t); \
+    template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \
+    template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int, cudaStream_t);
 // template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); \
 // template void solveUpperLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \
 // template void solveLowerLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int, cudaStream_t);
Expand All @@ -471,9 +471,9 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 6);

 #define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar) \
     template void solveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
-        MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int); \
+        MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int, cudaStream_t); \
     template void solveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
-        MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, const LinearSolverScalar*, LinearSolverScalar*, int);
+        MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, const LinearSolverScalar*, LinearSolverScalar*, int, cudaStream_t);
 
 // TODO: be smarter about this... Surely this instantiates many more combinations that are actually needed
 #define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \
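The INSTANTIATE_* macros exist because these template wrappers are defined in a .cu translation unit: every (scalar type, block size) combination used elsewhere must be explicitly instantiated, which is why the commit has to append the trailing cudaStream_t to each instantiated signature. A self-contained sketch of the pattern (hypothetical scale wrapper, not the OPM kernels):

#include <cuda_runtime.h>

template <class T, int blocksize>
__global__ void cuScale(T* v, T alpha, int n)
{
    // Each thread handles one block of `blocksize` consecutive entries.
    int base = (blockIdx.x * blockDim.x + threadIdx.x) * blocksize;
    for (int k = 0; k < blocksize; ++k) {
        if (base + k < n) {
            v[base + k] *= alpha;
        }
    }
}

// Host wrapper; the template body is visible only inside this .cu file.
template <class T, int blocksize>
void scale(T* v, T alpha, int n, int thrBlockSize, cudaStream_t stream)
{
    int nWorkItems = (n + blocksize - 1) / blocksize;
    int nThreadBlocks = (nWorkItems + thrBlockSize - 1) / thrBlockSize;
    cuScale<T, blocksize><<<nThreadBlocks, thrBlockSize, 0, stream>>>(v, alpha, n);
}

// Explicit instantiations: one line per (type, blocksize) pair that other
// translation units may link against, as INSTANTIATE_KERNEL_WRAPPERS does.
#define INSTANTIATE_SCALE(T, blocksize) \
    template void scale<T, blocksize>(T*, T, int, int, cudaStream_t);

INSTANTIATE_SCALE(float, 3)
INSTANTIATE_SCALE(double, 3)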
