From 6e2fd8e7dcac2a45a6ad5300069800ef4fdd8f5d Mon Sep 17 00:00:00 2001 From: John Dennis Date: Mon, 15 Jul 2024 13:51:56 -0600 Subject: [PATCH] Revised calcHTranspose operator (#68) * Initial version with revised H^t operator * Next step in Memory reduction. * More explicit definitions of ints. * Code cleanup for new CSR format for H^t matrix * Verified that this works for the hurricane_4panel case. * More bug fixes for new-H^t operator. * Reduced int length for JH array * Change integer size * Cleanup calculation of nnz * Removed remaining old H^t code. * Minor cleanup of the code. * Cleanup * Clarification on the size of int used in the code. * Fixed OpenACC directive --------- Co-authored-by: Jian Sun --- ncar_scripts/casper_h100_submit.sh | 3 +- ncar_scripts/derecho_a100_submit.sh | 8 +- src/CostFunction.h | 2 +- src/CostFunction3D.cpp | 200 ++++++++++++++-------------- src/CostFunction3D.h | 17 ++- src/precision.h | 1 - 6 files changed, 122 insertions(+), 109 deletions(-) diff --git a/ncar_scripts/casper_h100_submit.sh b/ncar_scripts/casper_h100_submit.sh index f26c78c..403d0eb 100644 --- a/ncar_scripts/casper_h100_submit.sh +++ b/ncar_scripts/casper_h100_submit.sh @@ -27,8 +27,7 @@ cd ncar_scripts # Run a case # ############## suffix="casper_gpu" -for i in beltrami supercell hurricane typhoonChanthu2020 # hurricane_4panel - +for i in beltrami supercell hurricane typhoonChanthu2020 hurricane_4panel do ./ncar_run.sh $SAMURAI_ROOT/ncar_scripts/TDRP/${i}.tdrp >& log_${i}_$suffix.$ID if [ ! -d ${i}_${suffix} ]; then diff --git a/ncar_scripts/derecho_a100_submit.sh b/ncar_scripts/derecho_a100_submit.sh index 750d798..f5ae92e 100644 --- a/ncar_scripts/derecho_a100_submit.sh +++ b/ncar_scripts/derecho_a100_submit.sh @@ -1,7 +1,7 @@ #!/bin/bash -l #PBS -N SAMURAI #PBS -A NEOL0013 -#PBS -l select=1:ncpus=64:ompthreads=1:mem=100GB:ngpus=1 +#PBS -l select=1:ncpus=64:ompthreads=1:mem=300GB:ngpus=1 #PBS -q main #PBS -l walltime=02:30:00 #PBS -j oe @@ -12,6 +12,10 @@ cd .. 
export SAMURAI_ROOT=$(pwd) ID=`date '+%Y%m%d%H%M'` + +sed -i 's/cc70/cc80/g' CMakeLists.txt +sed -i 's/cc90/cc80/g' CMakeLists.txt + ################## # Build the code # ################## @@ -25,7 +29,7 @@ cd ncar_scripts # Run a case # ############## suffix="derecho_gpu" -for i in beltrami supercell hurricane typhoonChanthu2020 # hurricane_4panel +for i in beltrami supercell hurricane typhoonChanthu2020 # hurricane_4panel do ./ncar_run.sh $SAMURAI_ROOT/ncar_scripts/TDRP/${i}.tdrp >& log_${i}_$suffix.$ID diff --git a/src/CostFunction.h b/src/CostFunction.h index 86d45a4..065fad2 100644 --- a/src/CostFunction.h +++ b/src/CostFunction.h @@ -28,7 +28,7 @@ class CostFunction protected: int ls_cnt; - int64_t mObs; + uint64_t mObs; int nState; real* currState; real* currGradient; diff --git a/src/CostFunction3D.cpp b/src/CostFunction3D.cpp index 61f2cb3..6ecaccc 100644 --- a/src/CostFunction3D.cpp +++ b/src/CostFunction3D.cpp @@ -94,15 +94,18 @@ void CostFunction3D::finalize() delete[] stateB; delete[] stateC; // deallocate Clean-up the data that correspond to the H matrix - #pragma acc exit data delete(mPtr,mVal,I2H) - delete[] mPtr; - delete[] mVal; - delete[] I2H; + #pragma acc exit data delete(H,JH,IH) delete[] H; delete[] JH; delete[] IH; + #pragma acc exit data delete(Ht,JHt,IHt) + delete[] Ht; + delete[] JHt; + delete[] IHt; + + if (basisappx > 0) { delete[] basis0; delete[] basis1; @@ -480,6 +483,7 @@ void CostFunction3D::initState(const int iteration) } // end of iteration == 1 + //JMD variable-interleave for (int var = 0; var < varDim; var++) { // Init node variance for (int iIndex = 0; iIndex < iDim; iIndex++) { @@ -494,6 +498,7 @@ void CostFunction3D::initState(const int iteration) } } + //JMD variable-interleave // Compute and display the variable BG errors and RMS of values for (int var = 0; var < varDim; var++) { real varScale = 0; @@ -747,6 +752,7 @@ void CostFunction3D::updateBG() std::string cFilename = outputPath + "/samurai_Coefficients.out"; 
ofstream cstream(cFilename); + //JMD variable-interleave cstream << "Variable\tI\tJ\tK\tBackground\tAnalysis\tIncrement\n"; for (int var = 0; var < varDim; var++) { for (int iIndex = 0; iIndex < iDim; iIndex++) { @@ -800,33 +806,30 @@ void CostFunction3D::calcInnovation() void CostFunction3D::calcHTranspose(const real* yhat, real* Astate) { - integer j,n,m,k,ms,me; + uint64_t n,m; + uint64_t j,begin,end; real tmp,val; - #pragma acc data present(yhat,Astate,mPtr,mVal,I2H,H) - { - GPTLstart("CostFunction3D::calcHTranspose"); - #pragma omp parallel for private(n,k,ms,me,tmp,m,j,val) - #pragma acc parallel loop gang vector vector_length(32) private(n,k,ms,me,tmp,m,j,val) - for(n=0;nms){ - for (k=ms;k0;i--) {IHt[i] = IHt[i-1];} + IHt[0]=0; + std::cout << "CostFunction3D::calcHmatrix: After construction of H^t" << std::endl; + // // copy H matrix stuff to the GPU Device - #pragma acc enter data copyin(mPtr,mVal,I2H) - #pragma acc enter data copyin(H[:nonzeros]) - cout << "Memory usage for [obsVector] (Mbytes): " << sizeof(real)*(mObs*(7+varDim*derivDim))/(1024.0*1024.0) << "\n"; - cout << "Memory usage for [obsData] (Mbytes): " << sizeof(real)*(mObs)/(1024.0*1024.0) << "\n"; - cout << "Memory usage for [HCq] (Mbytes): " << sizeof(real)*(mObs+(iDim*jDim*kDim))/(1024.0*1024.0) << "\n"; - cout << "Memory usage for [mPtr,mVal,I2H] (Mbytes): " << sizeof(integer)*(nState+2.*nonzeros+1)/(1024.*1024.) << "\n"; - cout << "Memory usage for [IH,JH] (Mbytes): " << sizeof(integer)*(mObs+nonzeros+1)/(1024.*1024.) << "\n"; - cout << "Memory usage for [state] (Mbytes): " << sizeof(real)*(nState)/(1024.*1024.) 
<< "\n"; - - delete[] Hlength; - delete[] mIncr; - delete[] mTmp; + #pragma acc enter data copyin(H[:nnz]) + std::cout << "Memory usage for [obsVector] (Mbytes): " << sizeof(real)*(mObs*(7+varDim*derivDim))/(1024.0*1024.0) << std::endl; + std::cout << "Memory usage for [obsData] (Mbytes): " << sizeof(real)*(mObs)/(1024.0*1024.0) << std::endl; + std::cout << "Memory usage for [HCq] (Mbytes): " << sizeof(real)*(mObs+(iDim*jDim*kDim))/(1024.0*1024.0) << std::endl; + std::cout << "Memory usage for [IH,JH] (Mbytes): " << (sizeof(uint64_t)*(mObs+1)+ sizeof(int32_t)*nnz)/(1024.*1024.) << std::endl; + std::cout << "Memory usage for [state] (Mbytes): " << sizeof(real)*(nState)/(1024.*1024.) << std::endl; + #pragma acc enter data copyin(Ht[:nnz]) + std::cout << "Memory usage for [IHt,JHt] (Mbytes): " << (sizeof(uint64_t)*(nState+1)+ sizeof(int32_t)*nnz)/(1024.*1024.) << std::endl; + //GPTLstop("CostFunction3D::calcHmatrix:deallocate"); GPTLstop("CostFunction3D::calcHmatrix"); @@ -2663,8 +2667,8 @@ void CostFunction3D::calcHmatrix() void CostFunction3D::Htransform(const real* Cstate, real* Hstate) { - integer i,j; - integer begin,end; + uint64_t i; + uint64_t j,begin,end; real tmp; #pragma acc data present(Cstate,Hstate) diff --git a/src/CostFunction3D.h b/src/CostFunction3D.h index 63ebc00..3d7ab52 100644 --- a/src/CostFunction3D.h +++ b/src/CostFunction3D.h @@ -120,7 +120,7 @@ CostFunction3D(const Projection& proj, const int& numObs = 0, const int& stateSi real* bgState; real* bgStdDev; real* obsVector; - real* obsData; // This only contains data that is needed by the calcHTranspose2 subroutine + real* obsData; // This only contains the necessary data for the calcHTranspose subroutine real* rawObs; real* stateA; real* stateB; @@ -135,10 +135,10 @@ CostFunction3D(const Projection& proj, const int& numObs = 0, const int& stateSi real* iGamma[7]; real* jGamma[7]; real* kGamma[7]; - real* kGammaL; + real* kGammaL; real* kLL; real* finalAnalysis; - int64_t varDim; // NCAR: 
promoted to 64-bit, since it should auto-promote calculations with it to 64-bit + uint64_t varDim; // NCAR: promoted to 64-bit, since it should auto-promote calculations with it to 64-bit int derivDim; real bgError[7]; int iBCL[7], iBCR[7], jBCL[7], jBCR[7], kBCL[7], kBCR[7]; @@ -151,9 +151,16 @@ CostFunction3D(const Projection& proj, const int& numObs = 0, const int& stateSi double *iFFTin, *jFFTin, *kFFTin; fftw_complex *iFFTout, *jFFTout, *kFFTout; bool UseFFT; + + // explicitly store the H matrix in CSR format real *H; - integer *IH, *I2H,*JH; - integer *mPtr, *mVal; + uint64_t *IH; // Array with extent:(mObs+1) can take on values [0 to nonzeros] + uint32_t *JH; // Array with extent:(nonzeros) can take on values [0 to nState-1] + + // explicitly store the H^t matrix in CSR format + real *Ht; + uint64_t *IHt; // Array with extent(nState+1) can take on values [0 to nonzeros] + uint32_t *JHt; // Array with extent(nonzeros) can take on values [0 to mObs-1] int basisappx; real* basis0; diff --git a/src/precision.h b/src/precision.h index aef4993..3c32b55 100644 --- a/src/precision.h +++ b/src/precision.h @@ -13,6 +13,5 @@ #include typedef double real; -typedef unsigned long int integer; #endif