From c0941a81164e14ecc4fba415e4964cc4282b561f Mon Sep 17 00:00:00 2001 From: Dan Giles Date: Wed, 14 Jul 2021 13:43:05 +0100 Subject: [PATCH 1/4] Test version of the spherical polar --- sp/NumericalFluxes_sph.h | 17 +-- sp/computeFluxes.h | 8 +- sp/computeFluxes_sph.h | 222 +++++++++++++++++++-------------------- volna_init/mesh.hpp | 84 +++++++++++---- 4 files changed, 187 insertions(+), 144 deletions(-) diff --git a/sp/NumericalFluxes_sph.h b/sp/NumericalFluxes_sph.h index beee7f0..58946e5 100644 --- a/sp/NumericalFluxes_sph.h +++ b/sp/NumericalFluxes_sph.h @@ -1,25 +1,26 @@ inline void NumericalFluxes_sph(float *left, //OP_INC float *right, //OP_INC const float *leftcellCenters, const float *rightcellCenters, + // const float *edgeCenters, const float *edgeFluxes, //OP_READ const float *bathySource, //OP_READ const float *edgeNormals, const int *isRightBoundary, const float *cellVolumes0, //OP_READ const float *cellVolumes1) //OP_READ) { - left[0] -= (edgeFluxes[0])/(Radius*cellVolumes0[0]); + left[0] -= (edgeFluxes[0])/(Radius*cellVolumes0[0]*cos(M_PI*leftcellCenters[1]/180.0)); left[1] -= (edgeFluxes[1] + bathySource[0] * edgeNormals[0])/(Radius*cellVolumes0[0]*cos(M_PI*leftcellCenters[1]/180.0)); - left[2] -= (edgeFluxes[2] + bathySource[0] * edgeNormals[1])/(Radius*cellVolumes0[0]); + left[2] -= (edgeFluxes[2] + bathySource[0] * edgeNormals[1])/(Radius*cellVolumes0[0]*cos(M_PI*leftcellCenters[1]/180.0)); // Added centered Source Term - left[1] += (bathySource[2] *edgeNormals[0])/(Radius*cellVolumes0[0]*cos(M_PI*leftcellCenters[1]/180.0)); - left[2] += (bathySource[2] *edgeNormals[1])/(Radius*cellVolumes0[0]); + // left[1] += (bathySource[2] *edgeNormals[0])/(Radius*cellVolumes0[0]*cos(M_PI*46.0/180.0));//*cos(M_PI*leftcellCenters[1]/180.0)); + left[2] += (bathySource[2] *edgeNormals[1])/(Radius*cellVolumes0[0]*cos(M_PI*leftcellCenters[1]/180.0)); if (!*isRightBoundary) { - right[0] += edgeFluxes[0]/cellVolumes1[0]; + right[0] += 
(edgeFluxes[0])/(Radius*cellVolumes1[0]*cos(M_PI*rightcellCenters[1]/180.0)); right[1] += (edgeFluxes[1] + bathySource[1] * edgeNormals[0])/(Radius*cellVolumes1[0]*cos(M_PI*rightcellCenters[1]/180.0)); - right[2] += (edgeFluxes[2] + bathySource[1] * edgeNormals[1])/(Radius*cellVolumes1[0]); + right[2] += (edgeFluxes[2] + bathySource[1] * edgeNormals[1])/(Radius*cellVolumes1[0]*cos(M_PI*rightcellCenters[1]/180.0)); // Added centered Source Term - right[1] -= (bathySource[3] *edgeNormals[0])/(Radius*cellVolumes1[0]*cos(M_PI*rightcellCenters[1]/180.0)); - right[2] -= (bathySource[3] *edgeNormals[1])/(Radius*cellVolumes1[0]); + // right[1] -= (bathySource[3] *edgeNormals[0])/(Radius*cellVolumes1[0]*cos(M_PI*46.0/180.0));//*cos(M_PI*rightcellCenters[1]/180.0)); + right[2] -= (bathySource[3] *edgeNormals[1])/(Radius*cellVolumes1[0]*cos(M_PI*rightcellCenters[1]/180.0)); } else { right[0] += 0.0f; right[1] += 0.0f; diff --git a/sp/computeFluxes.h b/sp/computeFluxes.h index dde89ba..748746c 100644 --- a/sp/computeFluxes.h +++ b/sp/computeFluxes.h @@ -29,7 +29,7 @@ inline void computeFluxes(const float *cellLeft, const float *cellRight, leftCellValues[1] += alphaleft[0] * ((dxl * leftGradient[2])+(dyl * leftGradient[3])); leftCellValues[2] += alphaleft[0] * ((dxl * leftGradient[4])+(dyl * leftGradient[5])); leftCellValues[3] += alphaleft[0] * ((dxl * leftGradient[6])+(dyl * leftGradient[7])); - if (leftCellValues[0] >= 1e-3){ + if (leftCellValues[0] >= EPS){ uL = leftCellValues[1]/leftCellValues[0]; vL = leftCellValues[2]/leftCellValues[0]; } else { @@ -49,7 +49,7 @@ inline void computeFluxes(const float *cellLeft, const float *cellRight, rightCellValues[1] += alpharight[0] * ((dxr * rightGradient[2])+(dyr * rightGradient[3])); rightCellValues[2] += alpharight[0] * ((dxr * rightGradient[4])+(dyr * rightGradient[5])); rightCellValues[3] += alpharight[0] * ((dxr * rightGradient[6])+(dyr * rightGradient[7])); - if (rightCellValues[0] >= 1e-3){ + if (rightCellValues[0] >= EPS){ 
uR = rightCellValues[1]/rightCellValues[0]; vR = rightCellValues[2]/rightCellValues[0]; } else { @@ -135,12 +135,12 @@ inline void computeFluxes(const float *cellLeft, const float *cellRight, sStar = (sL*hR*(uRn - sR) - sR*hL*(uLn - sL))/ (hR*(uRn - sR) - hL*(uLn - sL)); - if ((leftCellValues[0] <= EPS) && (rightCellValues[0] > EPS)) { + if ((hL <= EPS) && (hR > EPS)) { sL = uRn - 2.0f*cR; sR = uRn + cR; sStar = sL; } - if ((rightCellValues[0] <= EPS) && (leftCellValues[0] > EPS)) { + if ((hR <= EPS) && (hL > EPS)) { sR = uLn + 2.0f*cL; sL = uLn - cL; sStar = sR; diff --git a/sp/computeFluxes_sph.h b/sp/computeFluxes_sph.h index 13c07dd..f5133be 100644 --- a/sp/computeFluxes_sph.h +++ b/sp/computeFluxes_sph.h @@ -19,17 +19,18 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, leftCellValues[2] = cellLeft[2]; leftCellValues[3] = cellLeft[3]; float dxl, dyl, dxr, dyr; - dxl = (edgeCenters[0] - leftcellCenters[0]); - dyl = (edgeCenters[1] - leftcellCenters[1]); - dxr = (edgeCenters[0] - rightcellCenters[0]); - dyr = (edgeCenters[1] - rightcellCenters[1]); + // Convert to arc length + dxl = (edgeCenters[0] - leftcellCenters[0])*Radius; + dyl = (edgeCenters[1] - leftcellCenters[1])*Radius; + dxr = (edgeCenters[0] - rightcellCenters[0])*Radius; + dyr = (edgeCenters[1] - rightcellCenters[1])*Radius; // Second order Reconstruction - leftCellValues[0] += alphaleft[0] * ((dxl * leftGradient[0])+(dyl * leftGradient[1])); - leftCellValues[1] += alphaleft[0] * ((dxl * leftGradient[2])+(dyl * leftGradient[3])); - leftCellValues[2] += alphaleft[0] * ((dxl * leftGradient[4])+(dyl * leftGradient[5])); - leftCellValues[3] += alphaleft[0] * ((dxl * leftGradient[6])+(dyl * leftGradient[7])); - if (leftCellValues[0] >= 1e-3){ + // leftCellValues[0] += alphaleft[0] * ((dxl * leftGradient[0])+(dyl * leftGradient[1])); + // leftCellValues[1] += alphaleft[0] * ((dxl * leftGradient[2])+(dyl * leftGradient[3])); + // leftCellValues[2] += alphaleft[0] * ((dxl * 
leftGradient[4])+(dyl * leftGradient[5])); + // leftCellValues[3] += alphaleft[0] * ((dxl * leftGradient[6])+(dyl * leftGradient[7])); + if (leftCellValues[0] >= EPS){ uL = leftCellValues[1]/leftCellValues[0]; vL = leftCellValues[2]/leftCellValues[0]; } else { @@ -37,6 +38,7 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, vL = 0.0f; } zL = cellLeft[3] - cellLeft[0]; + zR = cellRight[3] - cellRight[0]; if (!*isRightBoundary) { rightCellValues[0] = cellRight[0]; @@ -45,11 +47,11 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, rightCellValues[3] = cellRight[3]; // Second order Reconstruction - rightCellValues[0] += alpharight[0] * ((dxr * rightGradient[0])+(dyr * rightGradient[1])); - rightCellValues[1] += alpharight[0] * ((dxr * rightGradient[2])+(dyr * rightGradient[3])); - rightCellValues[2] += alpharight[0] * ((dxr * rightGradient[4])+(dyr * rightGradient[5])); - rightCellValues[3] += alpharight[0] * ((dxr * rightGradient[6])+(dyr * rightGradient[7])); - if (rightCellValues[0] >= 1e-3){ + // rightCellValues[0] += alpharight[0] * ((dxr * rightGradient[0])+(dyr * rightGradient[1])); + // rightCellValues[1] += alpharight[0] * ((dxr * rightGradient[2])+(dyr * rightGradient[3])); + // rightCellValues[2] += alpharight[0] * ((dxr * rightGradient[4])+(dyr * rightGradient[5])); + // rightCellValues[3] += alpharight[0] * ((dxr * rightGradient[6])+(dyr * rightGradient[7])); + if (rightCellValues[0] >= EPS){ uR = rightCellValues[1]/rightCellValues[0]; vR = rightCellValues[2]/rightCellValues[0]; } else { @@ -57,37 +59,40 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, vR = 0.0f; } } else { - float nx = edgeNormals[0]; - float ny = edgeNormals[1]; - float inNormalVelocity = uL * nx + vL * ny; - float inTangentVelocity = -1.0f * uL * ny + vL * nx; - float outNormalVelocity = 0.0f; - float outTangentVelocity = 0.0f; - //rightCellValues[3] = leftCellValues[3]; - // Outflow - // float 
wet = (fabs(*zmin) - zL) > 0.0f ? (fabs(*zmin) - zL) : EPS; - // rightCellValues[3] = zL + wet; - // rightCellValues[0] = wet; - rightCellValues[3] = leftCellValues[3]; - rightCellValues[0] = leftCellValues[0]; - outNormalVelocity = inNormalVelocity; - outTangentVelocity = inTangentVelocity; - uR = outNormalVelocity * nx - outTangentVelocity * ny; - vR = outNormalVelocity * ny + outTangentVelocity * nx; - - // Wall - // rightCellValues[3] = leftCellValues[3]; - // rightCellValues[0] = leftCellValues[0]; - // outNormalVelocity = -1.0f*inNormalVelocity; - // outTangentVelocity = inTangentVelocity; - // uR = outNormalVelocity * nx - outTangentVelocity * ny; - // vR = outNormalVelocity * ny + outTangentVelocity * nx; + float nx = edgeNormals[0]; + float ny = edgeNormals[1]; + float inNormalVelocity = uL * nx + vL * ny; + float inTangentVelocity = -1.0f * uL * ny + vL * nx; + float outNormalVelocity = 0.0f; + float outTangentVelocity = 0.0f; - - rightCellValues[1] = uR*rightCellValues[0]; - rightCellValues[2] = vR*rightCellValues[0]; + // Outflow + float wet = (fabs(*zmin) - zL) > 0.0f ? 
(fabs(*zmin) - zL) : EPS; + float critical = sqrt(uL*uL + vL*vL); + outTangentVelocity = inTangentVelocity; + if (critical < sqrt(g*leftCellValues[0])){ + rightCellValues[0] = wet; + rightCellValues[3] = wet + zL; + outNormalVelocity = 0.0f; + } else { + rightCellValues[0] = leftCellValues[0]; + rightCellValues[3] = leftCellValues[3]; + outNormalVelocity = inNormalVelocity; + } + /* + // Wall + rightCellValues[3] = leftCellValues[3]; + rightCellValues[0] = leftCellValues[0]; + outNormalVelocity = -1.0f*inNormalVelocity; + outTangentVelocity = inTangentVelocity; + */ + uR = outNormalVelocity * nx - outTangentVelocity * ny; + vR = outNormalVelocity * ny + outTangentVelocity * nx; + rightCellValues[1] = uR*rightCellValues[0]; + rightCellValues[2] = vR*rightCellValues[0]; + zR = zL; } - zR = cellRight[3] - cellRight[0]; + rightCellValues[3] -= rightCellValues[0]; leftCellValues[3] -= leftCellValues[0]; // ------------------------------------------------------------------------------------ @@ -104,10 +109,11 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, hR = hR > 0.0f ? 
hR : 0.0f; bathySource[0] -= .5f * g * (hL * hL); bathySource[1] -= .5f * g * (hR * hR); - // Audusse Reconstruction(2005) 2nd order Centered term - bathySource[2] = -.5f * g *(leftCellValues[0] + cellLeft[0])*(leftCellValues[3] - zL); - bathySource[3] = -.5f * g *(rightCellValues[0] + cellRight[0])*(rightCellValues[3] - zR); - + // Audusse Reconstruction(2005) 2nd order centered term + // bathySource[2] = -.5f * g *(leftCellValues[0] + cellLeft[0])*(leftCellValues[3] - zL); + // bathySource[3] = -.5f * g *(rightCellValues[0] + cellRight[0])*(rightCellValues[3] - zR); + bathySource[2] = .5f * g * (hL * hL)*sin(M_PI*leftcellCenters[1]/180.0); + bathySource[3] = .5f * g * (hR * hR)*sin(M_PI*rightcellCenters[1]/180.0); bathySource[0] *= *edgeLength; bathySource[1] *= *edgeLength; bathySource[2] *= *edgeLength; @@ -134,14 +140,12 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, sStar = (sL*hR*(uRn - sR) - sR*hL*(uLn - sL))/ (hR*(uRn - sR) - hL*(uLn - sL)); - //if ((leftCellValues[0] <= EPS) && (rightCellValues[0] > EPS)) { - if ((leftCellValues[0] <= 1e-3) && (rightCellValues[0] > 1e-3)) { + if ((hL <= EPS) && (hR > EPS)) { sL = uRn - 2.0f*cR; sR = uRn + cR; sStar = sL; } - //if ((rightCellValues[0] <= EPS) && (leftCellValues[0] > EPS)) { - if ((rightCellValues[0] <= 1e-3) && (leftCellValues[0] > 1e-3)) { + if ((hR <= EPS) && (hL > EPS)) { sR = uLn + 2.0f*cL; sL = uLn - cL; sStar = sR; @@ -152,35 +156,29 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, float uRp = vR*edgeNormals[0] - uR*edgeNormals[1]; float LeftFluxes_H, LeftFluxes_U, LeftFluxes_V, LeftFluxes_N; - //inlined ProjectedPhysicalFluxes(leftCellValues, Normals, params, LeftFluxes); - float HuDotN = (leftCellValues[1]/cos(M_PI*leftcellCenters[1]/180.0)) * edgeNormals[0] + - (leftCellValues[2]) * edgeNormals[1]; - + // float HuDotN = (hL*uL) * edgeNormals[0] + (hL*vL) * edgeNormals[1]; + float HuDotN = (hL*uL) * edgeNormals[0] + 
(hL*vL*cos(M_PI*edgeCenters[1]/180.0)) * edgeNormals[1]; LeftFluxes_H = HuDotN; LeftFluxes_U = HuDotN * uL; LeftFluxes_V = HuDotN * vL; // Normal Momentum flux term LeftFluxes_N = HuDotN * uLn; - LeftFluxes_U += ((.5f * g * edgeNormals[0] ) * ( hL * hL ))/(cos(M_PI*leftcellCenters[1]/180.0)); - LeftFluxes_V += (.5f * g * edgeNormals[1] ) * ( hL * hL ); + LeftFluxes_U += (.5f * g * edgeNormals[0] ) * ( hL * hL); + LeftFluxes_V += (.5f * g * edgeNormals[1] ) * ( hL * hL* cos(M_PI*edgeCenters[1]/180.0)); LeftFluxes_N += (.5f * g ) * ( hL * hL ); - //end of inlined float RightFluxes_H, RightFluxes_U, RightFluxes_V, RightFluxes_N; - //inlined ProjectedPhysicalFluxes(rightCellValues, Normals, params, RightFluxes); - HuDotN = (rightCellValues[1]/cos(M_PI*rightcellCenters[1]/180.0)) * edgeNormals[0] + - (rightCellValues[2]) * edgeNormals[1]; - + HuDotN = (hR*uR) * edgeNormals[0] + (hR*vR*cos(M_PI*edgeCenters[1]/180.0)) * edgeNormals[1]; + // HuDotN = (hR*uR) * edgeNormals[0] + (hR*vR) * edgeNormals[1]; RightFluxes_H = HuDotN; RightFluxes_U = HuDotN * uR; RightFluxes_V = HuDotN * vR; // Normal Momentum flux term RightFluxes_N = HuDotN * uRn; - - RightFluxes_U += ((.5f * g * edgeNormals[0] ) * ( hR * hR ))/(cos(M_PI*rightcellCenters[1]/180.0)); - RightFluxes_V += (.5f * g * edgeNormals[1] ) * ( hR * hR ); - // RightFluxes_N += (.5f * g ) * ( hR * hR ); + RightFluxes_U += (.5f * g * edgeNormals[0] ) * ( hR * hR); + RightFluxes_V += (.5f * g * edgeNormals[1] ) * ( hR * hR* cos(M_PI*edgeCenters[1]/180.0)); + RightFluxes_N += (.5f * g ) * ( hR * hR ); // ------------------------------------------------------------------------ // HLLC Flux Solver (Batten et al. 
1997) // "On the choice of wavespeeds for the HLLC Reimann solver" @@ -212,7 +210,7 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, //------------------------------------------------------------------------ // HLL Flux Solver // ------------------------------------------------------------------------ - /*float sLMinus = sL < 0.0f ? sL : 0.0f; + float sLMinus = sL < 0.0f ? sL : 0.0f; float sRPlus = sR > 0.0f ? sR : 0.0f; float sRMinussL = sRPlus - sLMinus; sRMinussL = sRMinussL < EPS ? EPS : sRMinussL; @@ -223,67 +221,65 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, ( t1 * LeftFluxes_H ) + ( t2 * RightFluxes_H ) + ( t3 * ( hR - hL ) ); - + // op_printf("out %f \n", out[0]); out[1] = ( t1 * LeftFluxes_U ) + ( t2 * RightFluxes_U ) + - ( t3 * ( (rightCellValues[1]) - - (leftCellValues[1]) ) ); + ( t3 * ( (hR*uR) - (hL*uL) ) ); out[2] = ( t1 * LeftFluxes_V ) + ( t2 * RightFluxes_V ) + - ( t3 * ( (rightCellValues[2]) - - (leftCellValues[2]) ) ); - */ + ( t3 * ( (hR*vR) - (hL*vL) ) ); + // ------------------------------------------------------------------------ // HLLC Flux Solver (Huang et al. 2013) // "Well-balanced finite volume scheme for shallow water flooding and drying // over arbitrary topography" // ------------------------------------------------------------------------ - float sLMinus = sL < 0.0f ? sL : 0.0f; - float sRPlus = sR > 0.0f ? sR : 0.0f; - float sRMinussL = sRPlus - sLMinus; - sRMinussL = sRMinussL < EPS ? 
EPS : sRMinussL; - float t1 = sRPlus / sRMinussL; - float t2 = ( -1.0 * sLMinus ) / sRMinussL; - float t3 = ( sRPlus * sLMinus ) / sRMinussL; - float FStar[3]; - FStar[0] = - ( t1 * LeftFluxes_H ) + - ( t2 * RightFluxes_H ) + - ( t3 * ( hR - hL ) ); - - - FStar[1] = - ( t1 * LeftFluxes_N ) + - ( t2 * RightFluxes_N ) + - ( t3 * ( (hR * uRn) - - (hL * uLn) ) ); - - if( sL >= 0.0f) { - out[0] = t1*LeftFluxes_H; - out[1] = t1*LeftFluxes_U; - out[2] = t1*LeftFluxes_V; - } else if ((sL < 0.0f) && (sStar >= 0.0f)){ - out[0] = FStar[0]; - FStar[2] = FStar[0] * uLp; - out[1] = FStar[1]*edgeNormals[0] - FStar[2]*edgeNormals[1]; - out[2] = FStar[1]*edgeNormals[1] + FStar[2]*edgeNormals[0]; - } else if((sStar < 0.0f) && (sR >= 0.0f)){ - out[0] = FStar[0]; - FStar[2] = FStar[0] * uRp; - out[1] = FStar[1]*edgeNormals[0] - FStar[2]*edgeNormals[1]; - out[2] = FStar[1]*edgeNormals[1] + FStar[2]*edgeNormals[0]; - } else { - out[0] = t2*RightFluxes_H; - out[1] = t2*RightFluxes_U; - out[2] = t2*RightFluxes_V; - } + // float sLMinus = sL < 0.0f ? sL : 0.0f; + // float sRPlus = sR > 0.0f ? sR : 0.0f; + // float sRMinussL = sRPlus - sLMinus; + // sRMinussL = sRMinussL < EPS ? 
EPS : sRMinussL; + // float t1 = sRPlus / sRMinussL; + // float t2 = ( -1.0 * sLMinus ) / sRMinussL; + // float t3 = ( sRPlus * sLMinus ) / sRMinussL; + // float FStar[3]; + // FStar[0] = + // ( t1 * LeftFluxes_H ) + + // ( t2 * RightFluxes_H ) + + // ( t3 * ( hR - hL ) ); + // + // + // FStar[1] = + // ( t1 * LeftFluxes_N ) + + // ( t2 * RightFluxes_N ) + + // ( t3 * ( (hR * uRn) - + // (hL * uLn) ) ); + // + // if( sL >= 0.0f) { + // out[0] = t1*LeftFluxes_H; + // out[1] = t1*LeftFluxes_U; + // out[2] = t1*LeftFluxes_V; + // } else if ((sL < 0.0f) && (sStar >= 0.0f)){ + // out[0] = FStar[0]; + // FStar[2] = FStar[0] * uLp; + // out[1] = FStar[1]*edgeNormals[0] - FStar[2]*edgeNormals[1]; + // out[2] = FStar[1]*edgeNormals[1] + FStar[2]*edgeNormals[0]; + // } else if((sStar < 0.0f) && (sR >= 0.0f)){ + // out[0] = FStar[0]; + // FStar[2] = FStar[0] * uRp; + // out[1] = FStar[1]*edgeNormals[0] - FStar[2]*edgeNormals[1]; + // out[2] = FStar[1]*edgeNormals[1] + FStar[2]*edgeNormals[0]; + // } else { + // out[0] = t2*RightFluxes_H; + // out[1] = t2*RightFluxes_U; + // out[2] = t2*RightFluxes_V; + // } out[0] *= *edgeLength; out[1] *= *edgeLength; out[2] *= *edgeLength; - // op_printf("Length %f \n", *edgeLength); + float maximum = fabs(uLn + cL); maximum = maximum > fabs(uLn - cL) ? maximum : fabs(uLn - cL); maximum = maximum > fabs(uRn + cR) ? 
maximum : fabs(uRn + cR); diff --git a/volna_init/mesh.hpp b/volna_init/mesh.hpp index 066e760..bc442da 100755 --- a/volna_init/mesh.hpp +++ b/volna_init/mesh.hpp @@ -18,6 +18,7 @@ #include "meshObjects.hpp" #include "meshIo.hpp" #include "mathParser.hpp" +namespace bg = boost::geometry; inline bool PairCompare( const std::pair &elem1, const std::pair &elem2 ) @@ -63,7 +64,7 @@ class Mesh { void WriteMeshBandwith( std::ofstream & ); void ComputeConnectivity(); void LegacyInterface(); - void ComputeGeometricQuantities(); + void ComputeGeometricQuantities(unsigned int&); // void ComputeGradientInterpolator(); }; @@ -408,14 +409,14 @@ void Mesh::RCMRenumbering() { if ( neighbors.at(j) != -1 ) trueNeighbors.push_back( Cells.at(j) ); - + } - + for ( int j = 0; j < 3; ++j ) { int neighbor = neighbors.at( 2 - j ); - + if ( neighbor != -1 && parent[neighbor] == -1 ) { // neighbor is unvisited @@ -612,7 +613,7 @@ void Mesh::LegacyInterface() { // detect boundary Triangles if ( Cells.at(i).neighbors()[2] == -1 ) { - + Cells.at(i).boundary = true; Cells.at(i).boundary_type = -1; @@ -675,10 +676,14 @@ void Mesh::LegacyInterface() { // // } -void Mesh::ComputeGeometricQuantities() { +void Mesh::ComputeGeometricQuantities(unsigned int& spherical) { std::cerr << "Computing mesh geometric quantities... 
"; - + if (spherical){ + std::cerr << "Spherical coordinate system" << std::endl; + } else { + std::cerr << "Cartesian coordinate system" << std::endl; + } CellCenters = GeomValues( NVolumes ); CellVolumes = ScalarValue::Zero( NVolumes ); #pragma omp parallel for @@ -696,19 +701,37 @@ void Mesh::ComputeGeometricQuantities() { CellCenters.y( i ) = center( 1 ); CellCenters.z( i ) = center( 2 ); - // Cell volumes + Point p1 = Nodes[ Cells.at(i).vertices()[0] ]; Point p2 = Nodes[ Cells.at(i).vertices()[1] ]; Point p3 = Nodes[ Cells.at(i).vertices()[2] ]; - - RealType volume = 0.5 * std::abs( orient2d( p1, p2, p3 ) ); - + RealType volume; + + if (spherical){ + // Area is computed using boost libraries + typedef bg::model::point + > spherical_point; + // > spherical_point; + spherical_point vert1(p1.x(), p1.y()); + spherical_point vert2(p2.x(), p2.y()); + spherical_point vert3(p3.x(), p3.y()); + bg::model::polygon< spherical_point > sph_poly; + bg::append( sph_poly, vert1); + bg::append( sph_poly, vert2); + bg::append( sph_poly, vert3); + bg::append( sph_poly, vert1); + bg::correct(sph_poly); + double const earth_radius = 6378136.6; // miles + bg::strategy::area::spherical< spherical_point > spherical_earth(earth_radius); + volume = bg::area(sph_poly,spherical_earth); + } else { + volume = 0.5 * std::abs( orient2d( p1, p2, p3 ) ); + } CellVolumes( i ) = volume; } - - FacetCenters = GeomValues::GeomValues( NFaces ); - FacetNormals = GeomValues::GeomValues( NFaces ); + FacetCenters = GeomValues( NFaces ); + FacetNormals = GeomValues( NFaces ); FacetVolumes = ScalarValue::Zero( NFaces ); #pragma omp parallel for for ( int i = 0; i < NFaces; ++i ) { @@ -728,17 +751,40 @@ void Mesh::ComputeGeometricQuantities() { // Facet volume (e.g. 
length) Point p1 = Nodes[ Facets.at(i).vertices()[0] ]; Point p2 = Nodes[ Facets.at(i).vertices()[1] ]; - - RealType volume = (p2 - p1).norm(); - + RealType volume; + + if (spherical){ + // Length is computed using n-vector formulation (spherical assumption) + // double const earth_radius = 6378136.6; + // double rad = M_PI/180.0; + // double test_length = earth_radius*acos(cos(p1.x()*rad)*cos(p1.y()*rad)*cos(p2.x()*rad)*cos(p2.y()*rad) + + // cos(p1.y()*rad)*sin(p1.x()*rad)*cos(p2.y()*rad)*sin(p2.x()*rad) + + // sin(p1.y()*rad)*sin(p2.y()*rad)); + // Length is computed using boost libraries (geoid assumption) + typedef bg::model::point<double, 2, bg::cs::spherical_equatorial<bg::degree> + > spherical_point; + spherical_point vert1(p1.x(), p1.y()); + spherical_point vert2(p2.x(), p2.y()); + double const earth_radius = 6378136.6; // meters + volume = bg::distance(vert1, vert2)*earth_radius; + } else { + volume = (p2 - p1).norm(); + } FacetVolumes( i ) = volume; // Facet normal Vector normal = Vector::UnitX(); normal.x() = - ( p2 - p1 ).y(); normal.y() = ( p2 - p1).x(); - normal.normalize(); - + // normal.normalize(); + if (spherical){ + double rad = M_PI/180.0; + double sigma = sqrt((normal.x()/cos(FacetCenters.y(i)*rad))*(normal.x()/cos(FacetCenters.y(i)*rad)) + (normal.y()*normal.y())); + normal.x() = normal.x()/(sigma*cos(FacetCenters.y(i)*rad)); + normal.y() = normal.y()/(sigma); + } else { + normal.normalize(); + } // Flip the normal if needed.
// RealType // ScalarProduct = ( FacetCenters.col(i) - From 1a93fa5856e7445214d587fca380b11da9ba8b72 Mon Sep 17 00:00:00 2001 From: DanGiles Date: Mon, 15 Nov 2021 12:48:01 +0000 Subject: [PATCH 2/4] Removal of EvolveValuesRK3* and support for a100 GPUs --- sp/Makefile | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/sp/Makefile b/sp/Makefile index d10997e..200f08a 100755 --- a/sp/Makefile +++ b/sp/Makefile @@ -42,7 +42,7 @@ ifdef PTSCOTCH_INSTALL_PATH PTSCOTCH_INC = -I$(PTSCOTCH_INSTALL_PATH)/include -DHAVE_PTSCOTCH PTSCOTCH_INC = -I$(PTSCOTCH_INSTALL_PATH)/include PTSCOTCH_LIB = -L$(PTSCOTCH_INSTALL_PATH)/lib/ -lptscotch \ - -L$(PTSCOTCH_INSTALL_PATH)/lib/ -lptscotcherr + -L$(PTSCOTCH_INSTALL_PATH)/lib/ -lscotch -lptscotcherr endif ifeq ($(OP2_COMPILER),gnu) @@ -56,7 +56,7 @@ else ifeq ($(OP2_COMPILER),intel) CPP = icpc # CPPFLAGS = -g -O0 - CPPFLAGS = -O3 -xHost -parallel -g -DMPICH_IGNORE_CXX_SEEK -DMPICH_SKIP_MPICXX + CPPFLAGS = -O3 -parallel -g -DMPICH_IGNORE_CXX_SEEK -DMPICH_SKIP_MPICXX # CPPFLAGS = -g -O0 -vec-report -xSSE4.2 -parallel OMPFLAGS = -qopenmp MPICPP = $(MPI_INSTALL_PATH)/bin/mpicxx @@ -75,7 +75,7 @@ endif # -NVCCFLAGS = -arch=sm_60 -Xptxas=-v -O2 -m64 -g #-G +NVCCFLAGS = -arch=sm_80 -Xptxas=-v -O2 -m64 -g #-G # # master to make all versions @@ -114,16 +114,15 @@ else endif -cuda/volna_kernels_cu.o: cuda/volna_kernels.cu \ - EvolveValuesRK2_1.h EvolveValuesRK2_2.h EvolveValuesRK3_1.h EvolveValuesRK3_2.h EvolveValuesRK3_3.h EvolveValuesRK3_4.h applyConst.h getMaxElevation.h getTotalVol.h \ +cuda/volna_kernels_cu.o: cuda/volna_kernels.cu \ + EvolveValuesRK2_1.h EvolveValuesRK2_2.h applyConst.h getMaxElevation.h getMaxSpeed.h \ initBathymetry_formula.h initBathymetry_update.h initBore_select.h initEta_formula.h initGaussianLandslide.h \ - initU_formula.h initV_formula.h computeGradient.h computeFluxes.h limiter.h SpaceDiscretization.h NumericalFluxes.h simulation_1.h \ - values_operation2.h 
cuda/applyConst_kernel.cu cuda/EvolveValuesRK2_1_kernel.cu cuda/EvolveValuesRK2_2_kernel.cu cuda/EvolveValuesRK3_1_kernel.cu\ - cuda/EvolveValuesRK3_2_kernel.cu cuda/EvolveValuesRK3_3_kernel.cu cuda/EvolveValuesRK3_4_kernel.cu cuda/computeGradient_kernel.cu \ - cuda/computeFluxes_kernel.cu cuda/limiter_kernel.cu cuda/SpaceDiscretization_kernel.cu cuda/NumericalFluxes_kernel.cu \ - cuda/simulation_1_kernel.cu \ + initU_formula.h initV_formula.h computeGradient.h computeFluxes.h limiter.h Timestep.h \ + NumericalFluxes.h simulation_1.h values_operation2.h cuda/applyConst_kernel.cu \ + cuda/EvolveValuesRK2_1_kernel.cu cuda/EvolveValuesRK2_2_kernel.cu cuda/computeFluxes_kernel.cu \ + cuda/limiter_kernel.cu cuda/Timestep_kernel.cu cuda/NumericalFluxes_kernel.cu \ + cuda/simulation_1_kernel.cu cuda/computeGradient_kernel.cu cuda/getMaxSpeed_kernel.cu cuda/getMaxElevation_kernel.cu \ cuda/values_operation2_kernel.cu Makefile - nvcc $(VAR) $(INC) $(NVCCFLAGS) $(OP2_INC) $(HDF5_INC) -I$(MPI_INC) -c -o cuda/volna_kernels_cu.o cuda/volna_kernels.cu volna_mpi: volna.cpp volna_event.cpp volna_init.cpp volna_output.cpp volna_writeVTK.cpp volna_simulation.cpp volna_util.cpp Makefile From dd6f10a83e9abe6574518f7850056c48fbb9dfb1 Mon Sep 17 00:00:00 2001 From: DanGiles Date: Mon, 15 Nov 2021 13:00:37 +0000 Subject: [PATCH 3/4] Removal of EvolveValuesRK3* from Makefile, support for A100 GPUs and Regenerated cuda files --- sp/Friction_manning.h | 4 +- sp/Makefile | 22 +- sp/NumericalFluxes_sph.h | 17 +- sp/computeFluxes.h | 17 +- sp/computeFluxes_sph.h | 222 +++---- sp/cuda/EvolveValuesRK2_1_kernel.cu | 126 +--- sp/cuda/EvolveValuesRK2_2_kernel.cu | 105 +--- sp/cuda/Friction_manning_kernel.cu | 125 ++++ sp/cuda/NumericalFluxes_kernel.cu | 246 ++++---- sp/cuda/Timestep_kernel.cu | 218 +++++++ sp/cuda/applyConst_kernel.cu | 30 +- sp/cuda/computeFluxes_kernel.cu | 253 +++++--- sp/cuda/computeGradient_kernel.cu | 160 +++-- sp/cuda/gatherLocations_kernel.cu | 51 +- 
sp/cuda/getMaxElevation_kernel.cu | 29 +- sp/cuda/getMaxSpeed_kernel.cu | 27 +- sp/cuda/getTotalVol_kernel.cu | 10 +- sp/cuda/incConst_kernel.cu | 34 +- sp/cuda/initBathyRelative_formula_kernel.cu | 34 +- sp/cuda/initBathymetry_formula_kernel.cu | 32 +- sp/cuda/initBathymetry_large_kernel.cu | 84 ++- sp/cuda/initBathymetry_update_kernel.cu | 82 ++- sp/cuda/initBore_select_kernel.cu | 26 +- sp/cuda/initEta_formula_kernel.cu | 29 +- sp/cuda/initGaussianLandslide_kernel.cu | 26 +- sp/cuda/initU_formula_kernel.cu | 27 +- sp/cuda/initV_formula_kernel.cu | 28 +- sp/cuda/limiter_kernel.cu | 67 +- sp/cuda/simulation_1_kernel.cu | 26 +- sp/cuda/values_operation2_kernel.cu | 24 +- sp/cuda/volna_hybkernels.cu | 644 ++++++++++++++------ sp/cuda/volna_kernels.cu | 42 +- sp/volna_op.cpp | 52 +- sp/volna_simulation_op.cpp | 113 +++- 34 files changed, 1935 insertions(+), 1097 deletions(-) create mode 100644 sp/cuda/Friction_manning_kernel.cu create mode 100644 sp/cuda/Timestep_kernel.cu diff --git a/sp/Friction_manning.h b/sp/Friction_manning.h index 2652b19..0fa4e4b 100644 --- a/sp/Friction_manning.h +++ b/sp/Friction_manning.h @@ -14,10 +14,10 @@ inline void Friction_manning(const float *dT,const float *M_n, //OP_RW, discard // Update Momentum values[0] = TruncatedH; values[3] = values[3]; - if (values[0] <= 1e-3){ + if (values[0] <= EPS){ values[1] = 0.0f; values[2] = 0.0f; - } else if (1e-3 < values[0] <= 50.0f) { + } else if (EPS < values[0] <= 50.0f) { values[1] = values[1] / (1.0f + Fr * *dT); values[2] = values[2] / (1.0f + Fr * *dT); } else { diff --git a/sp/Makefile b/sp/Makefile index d10997e..101973a 100755 --- a/sp/Makefile +++ b/sp/Makefile @@ -42,7 +42,7 @@ ifdef PTSCOTCH_INSTALL_PATH PTSCOTCH_INC = -I$(PTSCOTCH_INSTALL_PATH)/include -DHAVE_PTSCOTCH PTSCOTCH_INC = -I$(PTSCOTCH_INSTALL_PATH)/include PTSCOTCH_LIB = -L$(PTSCOTCH_INSTALL_PATH)/lib/ -lptscotch \ - -L$(PTSCOTCH_INSTALL_PATH)/lib/ -lptscotcherr + -L$(PTSCOTCH_INSTALL_PATH)/lib/ -lscotch -lptscotcherr endif 
ifeq ($(OP2_COMPILER),gnu) @@ -56,7 +56,7 @@ else ifeq ($(OP2_COMPILER),intel) CPP = icpc # CPPFLAGS = -g -O0 - CPPFLAGS = -O3 -xHost -parallel -g -DMPICH_IGNORE_CXX_SEEK -DMPICH_SKIP_MPICXX + CPPFLAGS = -O3 -parallel -g -DMPICH_IGNORE_CXX_SEEK -DMPICH_SKIP_MPICXX # CPPFLAGS = -g -O0 -vec-report -xSSE4.2 -parallel OMPFLAGS = -qopenmp MPICPP = $(MPI_INSTALL_PATH)/bin/mpicxx @@ -75,7 +75,7 @@ endif # -NVCCFLAGS = -arch=sm_60 -Xptxas=-v -O2 -m64 -g #-G +NVCCFLAGS = -arch=sm_80 -Xptxas=-v -O2 -m64 -g #-G # # master to make all versions @@ -114,16 +114,14 @@ else endif -cuda/volna_kernels_cu.o: cuda/volna_kernels.cu \ - EvolveValuesRK2_1.h EvolveValuesRK2_2.h EvolveValuesRK3_1.h EvolveValuesRK3_2.h EvolveValuesRK3_3.h EvolveValuesRK3_4.h applyConst.h getMaxElevation.h getTotalVol.h \ +cuda/volna_kernels_cu.o: cuda/volna_kernels.cu \ + EvolveValuesRK2_1.h EvolveValuesRK2_2.h applyConst.h getMaxElevation.h getMaxSpeed.h \ initBathymetry_formula.h initBathymetry_update.h initBore_select.h initEta_formula.h initGaussianLandslide.h \ - initU_formula.h initV_formula.h computeGradient.h computeFluxes.h limiter.h SpaceDiscretization.h NumericalFluxes.h simulation_1.h \ - values_operation2.h cuda/applyConst_kernel.cu cuda/EvolveValuesRK2_1_kernel.cu cuda/EvolveValuesRK2_2_kernel.cu cuda/EvolveValuesRK3_1_kernel.cu\ - cuda/EvolveValuesRK3_2_kernel.cu cuda/EvolveValuesRK3_3_kernel.cu cuda/EvolveValuesRK3_4_kernel.cu cuda/computeGradient_kernel.cu \ - cuda/computeFluxes_kernel.cu cuda/limiter_kernel.cu cuda/SpaceDiscretization_kernel.cu cuda/NumericalFluxes_kernel.cu \ - cuda/simulation_1_kernel.cu \ - cuda/values_operation2_kernel.cu Makefile - + initU_formula.h initV_formula.h computeGradient.h computeFluxes.h limiter.h Timestep.h NumericalFluxes.h simulation_1.h \ + values_operation2.h Friction_manning.h cuda/applyConst_kernel.cu cuda/EvolveValuesRK2_1_kernel.cu cuda/EvolveValuesRK2_2_kernel.cu \ + cuda/computeFluxes_kernel.cu cuda/limiter_kernel.cu cuda/Timestep_kernel.cu 
cuda/NumericalFluxes_kernel.cu \ + cuda/simulation_1_kernel.cu cuda/computeGradient_kernel.cu cuda/getMaxSpeed_kernel.cu cuda/getMaxElevation_kernel.cu \ + cuda/values_operation2_kernel.cu cuda/Friction_manning_kernel.cu Makefile nvcc $(VAR) $(INC) $(NVCCFLAGS) $(OP2_INC) $(HDF5_INC) -I$(MPI_INC) -c -o cuda/volna_kernels_cu.o cuda/volna_kernels.cu volna_mpi: volna.cpp volna_event.cpp volna_init.cpp volna_output.cpp volna_writeVTK.cpp volna_simulation.cpp volna_util.cpp Makefile diff --git a/sp/NumericalFluxes_sph.h b/sp/NumericalFluxes_sph.h index 58946e5..beee7f0 100644 --- a/sp/NumericalFluxes_sph.h +++ b/sp/NumericalFluxes_sph.h @@ -1,26 +1,25 @@ inline void NumericalFluxes_sph(float *left, //OP_INC float *right, //OP_INC const float *leftcellCenters, const float *rightcellCenters, - // const float *edgeCenters, const float *edgeFluxes, //OP_READ const float *bathySource, //OP_READ const float *edgeNormals, const int *isRightBoundary, const float *cellVolumes0, //OP_READ const float *cellVolumes1) //OP_READ) { - left[0] -= (edgeFluxes[0])/(Radius*cellVolumes0[0]*cos(M_PI*leftcellCenters[1]/180.0)); + left[0] -= (edgeFluxes[0])/(Radius*cellVolumes0[0]); left[1] -= (edgeFluxes[1] + bathySource[0] * edgeNormals[0])/(Radius*cellVolumes0[0]*cos(M_PI*leftcellCenters[1]/180.0)); - left[2] -= (edgeFluxes[2] + bathySource[0] * edgeNormals[1])/(Radius*cellVolumes0[0]*cos(M_PI*leftcellCenters[1]/180.0)); + left[2] -= (edgeFluxes[2] + bathySource[0] * edgeNormals[1])/(Radius*cellVolumes0[0]); // Added centered Source Term - // left[1] += (bathySource[2] *edgeNormals[0])/(Radius*cellVolumes0[0]*cos(M_PI*46.0/180.0));//*cos(M_PI*leftcellCenters[1]/180.0)); - left[2] += (bathySource[2] *edgeNormals[1])/(Radius*cellVolumes0[0]*cos(M_PI*leftcellCenters[1]/180.0)); + left[1] += (bathySource[2] *edgeNormals[0])/(Radius*cellVolumes0[0]*cos(M_PI*leftcellCenters[1]/180.0)); + left[2] += (bathySource[2] *edgeNormals[1])/(Radius*cellVolumes0[0]); if (!*isRightBoundary) { - right[0] += 
(edgeFluxes[0])/(Radius*cellVolumes1[0]*cos(M_PI*rightcellCenters[1]/180.0)); + right[0] += edgeFluxes[0]/cellVolumes1[0]; right[1] += (edgeFluxes[1] + bathySource[1] * edgeNormals[0])/(Radius*cellVolumes1[0]*cos(M_PI*rightcellCenters[1]/180.0)); - right[2] += (edgeFluxes[2] + bathySource[1] * edgeNormals[1])/(Radius*cellVolumes1[0]*cos(M_PI*rightcellCenters[1]/180.0)); + right[2] += (edgeFluxes[2] + bathySource[1] * edgeNormals[1])/(Radius*cellVolumes1[0]); // Added centered Source Term - // right[1] -= (bathySource[3] *edgeNormals[0])/(Radius*cellVolumes1[0]*cos(M_PI*46.0/180.0));//*cos(M_PI*rightcellCenters[1]/180.0)); - right[2] -= (bathySource[3] *edgeNormals[1])/(Radius*cellVolumes1[0]*cos(M_PI*rightcellCenters[1]/180.0)); + right[1] -= (bathySource[3] *edgeNormals[0])/(Radius*cellVolumes1[0]*cos(M_PI*rightcellCenters[1]/180.0)); + right[2] -= (bathySource[3] *edgeNormals[1])/(Radius*cellVolumes1[0]); } else { right[0] += 0.0f; right[1] += 0.0f; diff --git a/sp/computeFluxes.h b/sp/computeFluxes.h index 748746c..e9cf409 100644 --- a/sp/computeFluxes.h +++ b/sp/computeFluxes.h @@ -29,7 +29,7 @@ inline void computeFluxes(const float *cellLeft, const float *cellRight, leftCellValues[1] += alphaleft[0] * ((dxl * leftGradient[2])+(dyl * leftGradient[3])); leftCellValues[2] += alphaleft[0] * ((dxl * leftGradient[4])+(dyl * leftGradient[5])); leftCellValues[3] += alphaleft[0] * ((dxl * leftGradient[6])+(dyl * leftGradient[7])); - if (leftCellValues[0] >= EPS){ + if (leftCellValues[0] >= 1e-3){ uL = leftCellValues[1]/leftCellValues[0]; vL = leftCellValues[2]/leftCellValues[0]; } else { @@ -49,7 +49,7 @@ inline void computeFluxes(const float *cellLeft, const float *cellRight, rightCellValues[1] += alpharight[0] * ((dxr * rightGradient[2])+(dyr * rightGradient[3])); rightCellValues[2] += alpharight[0] * ((dxr * rightGradient[4])+(dyr * rightGradient[5])); rightCellValues[3] += alpharight[0] * ((dxr * rightGradient[6])+(dyr * rightGradient[7])); - if (rightCellValues[0] 
>= EPS){ + if (rightCellValues[0] >= 1e-3){ uR = rightCellValues[1]/rightCellValues[0]; vR = rightCellValues[2]/rightCellValues[0]; } else { @@ -77,9 +77,9 @@ inline void computeFluxes(const float *cellLeft, const float *cellRight, rightCellValues[3] = leftCellValues[3]; outNormalVelocity = inNormalVelocity; } - /* + // Wall - rightCellValues[3] = leftCellValues[3]; + /*rightCellValues[3] = leftCellValues[3]; rightCellValues[0] = leftCellValues[0]; outNormalVelocity = -1.0f*inNormalVelocity; outTangentVelocity = inTangentVelocity; @@ -135,12 +135,14 @@ inline void computeFluxes(const float *cellLeft, const float *cellRight, sStar = (sL*hR*(uRn - sR) - sR*hL*(uLn - sL))/ (hR*(uRn - sR) - hL*(uLn - sL)); - if ((hL <= EPS) && (hR > EPS)) { + if ((leftCellValues[0] <= EPS) && (rightCellValues[0] > EPS)) { + // if ((leftCellValues[0] <= 1e-3) && (rightCellValues[0] > 1e-3)) { sL = uRn - 2.0f*cR; sR = uRn + cR; sStar = sL; } - if ((hR <= EPS) && (hL > EPS)) { + if ((rightCellValues[0] <= EPS) && (leftCellValues[0] > EPS)) { + // if ((rightCellValues[0] <= 1e-3) && (leftCellValues[0] > 1e-3)) { sR = uLn + 2.0f*cL; sL = uLn - cL; sStar = sR; @@ -151,6 +153,7 @@ inline void computeFluxes(const float *cellLeft, const float *cellRight, float uRp = vR*edgeNormals[0] - uR*edgeNormals[1]; float LeftFluxes_H, LeftFluxes_N, LeftFluxes_U, LeftFluxes_V; + //inlined ProjectedPhysicalFluxes(leftCellValues, Normals, params, LeftFluxes); float HuDotN = (hL*uL) * edgeNormals[0] + (hL*vL) * edgeNormals[1]; LeftFluxes_H = HuDotN; @@ -162,8 +165,10 @@ inline void computeFluxes(const float *cellLeft, const float *cellRight, LeftFluxes_U += (.5f * g * edgeNormals[0] ) * ( hL * hL ); LeftFluxes_V += (.5f * g * edgeNormals[1] ) * ( hL * hL ); LeftFluxes_N += (.5f * g ) * ( hL * hL ); + //end of inlined float RightFluxes_H,RightFluxes_N, RightFluxes_U, RightFluxes_V; + //inlined ProjectedPhysicalFluxes(rightCellValues, Normals, params, RightFluxes); HuDotN = (hR*uR) * edgeNormals[0] + (hR*vR) * 
edgeNormals[1]; RightFluxes_H = HuDotN; diff --git a/sp/computeFluxes_sph.h b/sp/computeFluxes_sph.h index f5133be..13c07dd 100644 --- a/sp/computeFluxes_sph.h +++ b/sp/computeFluxes_sph.h @@ -19,18 +19,17 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, leftCellValues[2] = cellLeft[2]; leftCellValues[3] = cellLeft[3]; float dxl, dyl, dxr, dyr; - // Convert to arc length - dxl = (edgeCenters[0] - leftcellCenters[0])*Radius; - dyl = (edgeCenters[1] - leftcellCenters[1])*Radius; - dxr = (edgeCenters[0] - rightcellCenters[0])*Radius; - dyr = (edgeCenters[1] - rightcellCenters[1])*Radius; + dxl = (edgeCenters[0] - leftcellCenters[0]); + dyl = (edgeCenters[1] - leftcellCenters[1]); + dxr = (edgeCenters[0] - rightcellCenters[0]); + dyr = (edgeCenters[1] - rightcellCenters[1]); // Second order Reconstruction - // leftCellValues[0] += alphaleft[0] * ((dxl * leftGradient[0])+(dyl * leftGradient[1])); - // leftCellValues[1] += alphaleft[0] * ((dxl * leftGradient[2])+(dyl * leftGradient[3])); - // leftCellValues[2] += alphaleft[0] * ((dxl * leftGradient[4])+(dyl * leftGradient[5])); - // leftCellValues[3] += alphaleft[0] * ((dxl * leftGradient[6])+(dyl * leftGradient[7])); - if (leftCellValues[0] >= EPS){ + leftCellValues[0] += alphaleft[0] * ((dxl * leftGradient[0])+(dyl * leftGradient[1])); + leftCellValues[1] += alphaleft[0] * ((dxl * leftGradient[2])+(dyl * leftGradient[3])); + leftCellValues[2] += alphaleft[0] * ((dxl * leftGradient[4])+(dyl * leftGradient[5])); + leftCellValues[3] += alphaleft[0] * ((dxl * leftGradient[6])+(dyl * leftGradient[7])); + if (leftCellValues[0] >= 1e-3){ uL = leftCellValues[1]/leftCellValues[0]; vL = leftCellValues[2]/leftCellValues[0]; } else { @@ -38,7 +37,6 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, vL = 0.0f; } zL = cellLeft[3] - cellLeft[0]; - zR = cellRight[3] - cellRight[0]; if (!*isRightBoundary) { rightCellValues[0] = cellRight[0]; @@ -47,11 +45,11 @@ inline void 
computeFluxes_sph(const float *cellLeft, const float *cellRight, rightCellValues[3] = cellRight[3]; // Second order Reconstruction - // rightCellValues[0] += alpharight[0] * ((dxr * rightGradient[0])+(dyr * rightGradient[1])); - // rightCellValues[1] += alpharight[0] * ((dxr * rightGradient[2])+(dyr * rightGradient[3])); - // rightCellValues[2] += alpharight[0] * ((dxr * rightGradient[4])+(dyr * rightGradient[5])); - // rightCellValues[3] += alpharight[0] * ((dxr * rightGradient[6])+(dyr * rightGradient[7])); - if (rightCellValues[0] >= EPS){ + rightCellValues[0] += alpharight[0] * ((dxr * rightGradient[0])+(dyr * rightGradient[1])); + rightCellValues[1] += alpharight[0] * ((dxr * rightGradient[2])+(dyr * rightGradient[3])); + rightCellValues[2] += alpharight[0] * ((dxr * rightGradient[4])+(dyr * rightGradient[5])); + rightCellValues[3] += alpharight[0] * ((dxr * rightGradient[6])+(dyr * rightGradient[7])); + if (rightCellValues[0] >= 1e-3){ uR = rightCellValues[1]/rightCellValues[0]; vR = rightCellValues[2]/rightCellValues[0]; } else { @@ -59,40 +57,37 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, vR = 0.0f; } } else { - float nx = edgeNormals[0]; - float ny = edgeNormals[1]; - float inNormalVelocity = uL * nx + vL * ny; - float inTangentVelocity = -1.0f * uL * ny + vL * nx; - float outNormalVelocity = 0.0f; - float outTangentVelocity = 0.0f; + float nx = edgeNormals[0]; + float ny = edgeNormals[1]; + float inNormalVelocity = uL * nx + vL * ny; + float inTangentVelocity = -1.0f * uL * ny + vL * nx; + float outNormalVelocity = 0.0f; + float outTangentVelocity = 0.0f; + //rightCellValues[3] = leftCellValues[3]; + // Outflow + // float wet = (fabs(*zmin) - zL) > 0.0f ? 
(fabs(*zmin) - zL) : EPS; + // rightCellValues[3] = zL + wet; + // rightCellValues[0] = wet; + rightCellValues[3] = leftCellValues[3]; + rightCellValues[0] = leftCellValues[0]; + outNormalVelocity = inNormalVelocity; + outTangentVelocity = inTangentVelocity; + uR = outNormalVelocity * nx - outTangentVelocity * ny; + vR = outNormalVelocity * ny + outTangentVelocity * nx; - // Outflow - float wet = (fabs(*zmin) - zL) > 0.0f ? (fabs(*zmin) - zL) : EPS; - float critical = sqrt(uL*uL + vL*vL); - outTangentVelocity = inTangentVelocity; - if (critical < sqrt(g*leftCellValues[0])){ - rightCellValues[0] = wet; - rightCellValues[3] = wet + zL; - outNormalVelocity = 0.0f; - } else { - rightCellValues[0] = leftCellValues[0]; - rightCellValues[3] = leftCellValues[3]; - outNormalVelocity = inNormalVelocity; - } - /* - // Wall - rightCellValues[3] = leftCellValues[3]; - rightCellValues[0] = leftCellValues[0]; - outNormalVelocity = -1.0f*inNormalVelocity; - outTangentVelocity = inTangentVelocity; - */ - uR = outNormalVelocity * nx - outTangentVelocity * ny; - vR = outNormalVelocity * ny + outTangentVelocity * nx; - rightCellValues[1] = uR*rightCellValues[0]; - rightCellValues[2] = vR*rightCellValues[0]; - zR = zL; - } + // Wall + // rightCellValues[3] = leftCellValues[3]; + // rightCellValues[0] = leftCellValues[0]; + // outNormalVelocity = -1.0f*inNormalVelocity; + // outTangentVelocity = inTangentVelocity; + // uR = outNormalVelocity * nx - outTangentVelocity * ny; + // vR = outNormalVelocity * ny + outTangentVelocity * nx; + + rightCellValues[1] = uR*rightCellValues[0]; + rightCellValues[2] = vR*rightCellValues[0]; + } + zR = cellRight[3] - cellRight[0]; rightCellValues[3] -= rightCellValues[0]; leftCellValues[3] -= leftCellValues[0]; // ------------------------------------------------------------------------------------ @@ -109,11 +104,10 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, hR = hR > 0.0f ? 
hR : 0.0f; bathySource[0] -= .5f * g * (hL * hL); bathySource[1] -= .5f * g * (hR * hR); - // Audusse Reconstruction(2005) 2nd order centered term - // bathySource[2] = -.5f * g *(leftCellValues[0] + cellLeft[0])*(leftCellValues[3] - zL); - // bathySource[3] = -.5f * g *(rightCellValues[0] + cellRight[0])*(rightCellValues[3] - zR); - bathySource[2] = .5f * g * (hL * hL)*sin(M_PI*leftcellCenters[1]/180.0); - bathySource[3] = .5f * g * (hR * hR)*sin(M_PI*rightcellCenters[1]/180.0); + // Audusse Reconstruction(2005) 2nd order Centered term + bathySource[2] = -.5f * g *(leftCellValues[0] + cellLeft[0])*(leftCellValues[3] - zL); + bathySource[3] = -.5f * g *(rightCellValues[0] + cellRight[0])*(rightCellValues[3] - zR); + bathySource[0] *= *edgeLength; bathySource[1] *= *edgeLength; bathySource[2] *= *edgeLength; @@ -140,12 +134,14 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, sStar = (sL*hR*(uRn - sR) - sR*hL*(uLn - sL))/ (hR*(uRn - sR) - hL*(uLn - sL)); - if ((hL <= EPS) && (hR > EPS)) { + //if ((leftCellValues[0] <= EPS) && (rightCellValues[0] > EPS)) { + if ((leftCellValues[0] <= 1e-3) && (rightCellValues[0] > 1e-3)) { sL = uRn - 2.0f*cR; sR = uRn + cR; sStar = sL; } - if ((hR <= EPS) && (hL > EPS)) { + //if ((rightCellValues[0] <= EPS) && (leftCellValues[0] > EPS)) { + if ((rightCellValues[0] <= 1e-3) && (leftCellValues[0] > 1e-3)) { sR = uLn + 2.0f*cL; sL = uLn - cL; sStar = sR; @@ -156,29 +152,35 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, float uRp = vR*edgeNormals[0] - uR*edgeNormals[1]; float LeftFluxes_H, LeftFluxes_U, LeftFluxes_V, LeftFluxes_N; - // float HuDotN = (hL*uL) * edgeNormals[0] + (hL*vL) * edgeNormals[1]; - float HuDotN = (hL*uL) * edgeNormals[0] + (hL*vL*cos(M_PI*edgeCenters[1]/180.0)) * edgeNormals[1]; + //inlined ProjectedPhysicalFluxes(leftCellValues, Normals, params, LeftFluxes); + float HuDotN = (leftCellValues[1]/cos(M_PI*leftcellCenters[1]/180.0)) * edgeNormals[0] + + 
(leftCellValues[2]) * edgeNormals[1]; + LeftFluxes_H = HuDotN; LeftFluxes_U = HuDotN * uL; LeftFluxes_V = HuDotN * vL; // Normal Momentum flux term LeftFluxes_N = HuDotN * uLn; - LeftFluxes_U += (.5f * g * edgeNormals[0] ) * ( hL * hL); - LeftFluxes_V += (.5f * g * edgeNormals[1] ) * ( hL * hL* cos(M_PI*edgeCenters[1]/180.0)); + LeftFluxes_U += ((.5f * g * edgeNormals[0] ) * ( hL * hL ))/(cos(M_PI*leftcellCenters[1]/180.0)); + LeftFluxes_V += (.5f * g * edgeNormals[1] ) * ( hL * hL ); LeftFluxes_N += (.5f * g ) * ( hL * hL ); + //end of inlined float RightFluxes_H, RightFluxes_U, RightFluxes_V, RightFluxes_N; - HuDotN = (hR*uR) * edgeNormals[0] + (hR*vR*cos(M_PI*edgeCenters[1]/180.0)) * edgeNormals[1]; - // HuDotN = (hR*uR) * edgeNormals[0] + (hR*vR) * edgeNormals[1]; + //inlined ProjectedPhysicalFluxes(rightCellValues, Normals, params, RightFluxes); + HuDotN = (rightCellValues[1]/cos(M_PI*rightcellCenters[1]/180.0)) * edgeNormals[0] + + (rightCellValues[2]) * edgeNormals[1]; + RightFluxes_H = HuDotN; RightFluxes_U = HuDotN * uR; RightFluxes_V = HuDotN * vR; // Normal Momentum flux term RightFluxes_N = HuDotN * uRn; - RightFluxes_U += (.5f * g * edgeNormals[0] ) * ( hR * hR); - RightFluxes_V += (.5f * g * edgeNormals[1] ) * ( hR * hR* cos(M_PI*edgeCenters[1]/180.0)); - RightFluxes_N += (.5f * g ) * ( hR * hR ); + + RightFluxes_U += ((.5f * g * edgeNormals[0] ) * ( hR * hR ))/(cos(M_PI*rightcellCenters[1]/180.0)); + RightFluxes_V += (.5f * g * edgeNormals[1] ) * ( hR * hR ); + // RightFluxes_N += (.5f * g ) * ( hR * hR ); // ------------------------------------------------------------------------ // HLLC Flux Solver (Batten et al. 
1997) // "On the choice of wavespeeds for the HLLC Reimann solver" @@ -210,7 +212,7 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, //------------------------------------------------------------------------ // HLL Flux Solver // ------------------------------------------------------------------------ - float sLMinus = sL < 0.0f ? sL : 0.0f; + /*float sLMinus = sL < 0.0f ? sL : 0.0f; float sRPlus = sR > 0.0f ? sR : 0.0f; float sRMinussL = sRPlus - sLMinus; sRMinussL = sRMinussL < EPS ? EPS : sRMinussL; @@ -221,65 +223,67 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, ( t1 * LeftFluxes_H ) + ( t2 * RightFluxes_H ) + ( t3 * ( hR - hL ) ); - // op_printf("out %f \n", out[0]); + out[1] = ( t1 * LeftFluxes_U ) + ( t2 * RightFluxes_U ) + - ( t3 * ( (hR*uR) - (hL*uL) ) ); + ( t3 * ( (rightCellValues[1]) - + (leftCellValues[1]) ) ); out[2] = ( t1 * LeftFluxes_V ) + ( t2 * RightFluxes_V ) + - ( t3 * ( (hR*vR) - (hL*vL) ) ); - + ( t3 * ( (rightCellValues[2]) - + (leftCellValues[2]) ) ); + */ // ------------------------------------------------------------------------ // HLLC Flux Solver (Huang et al. 2013) // "Well-balanced finite volume scheme for shallow water flooding and drying // over arbitrary topography" // ------------------------------------------------------------------------ - // float sLMinus = sL < 0.0f ? sL : 0.0f; - // float sRPlus = sR > 0.0f ? sR : 0.0f; - // float sRMinussL = sRPlus - sLMinus; - // sRMinussL = sRMinussL < EPS ? 
EPS : sRMinussL; - // float t1 = sRPlus / sRMinussL; - // float t2 = ( -1.0 * sLMinus ) / sRMinussL; - // float t3 = ( sRPlus * sLMinus ) / sRMinussL; - // float FStar[3]; - // FStar[0] = - // ( t1 * LeftFluxes_H ) + - // ( t2 * RightFluxes_H ) + - // ( t3 * ( hR - hL ) ); - // - // - // FStar[1] = - // ( t1 * LeftFluxes_N ) + - // ( t2 * RightFluxes_N ) + - // ( t3 * ( (hR * uRn) - - // (hL * uLn) ) ); - // - // if( sL >= 0.0f) { - // out[0] = t1*LeftFluxes_H; - // out[1] = t1*LeftFluxes_U; - // out[2] = t1*LeftFluxes_V; - // } else if ((sL < 0.0f) && (sStar >= 0.0f)){ - // out[0] = FStar[0]; - // FStar[2] = FStar[0] * uLp; - // out[1] = FStar[1]*edgeNormals[0] - FStar[2]*edgeNormals[1]; - // out[2] = FStar[1]*edgeNormals[1] + FStar[2]*edgeNormals[0]; - // } else if((sStar < 0.0f) && (sR >= 0.0f)){ - // out[0] = FStar[0]; - // FStar[2] = FStar[0] * uRp; - // out[1] = FStar[1]*edgeNormals[0] - FStar[2]*edgeNormals[1]; - // out[2] = FStar[1]*edgeNormals[1] + FStar[2]*edgeNormals[0]; - // } else { - // out[0] = t2*RightFluxes_H; - // out[1] = t2*RightFluxes_U; - // out[2] = t2*RightFluxes_V; - // } + float sLMinus = sL < 0.0f ? sL : 0.0f; + float sRPlus = sR > 0.0f ? sR : 0.0f; + float sRMinussL = sRPlus - sLMinus; + sRMinussL = sRMinussL < EPS ? 
EPS : sRMinussL; + float t1 = sRPlus / sRMinussL; + float t2 = ( -1.0 * sLMinus ) / sRMinussL; + float t3 = ( sRPlus * sLMinus ) / sRMinussL; + float FStar[3]; + FStar[0] = + ( t1 * LeftFluxes_H ) + + ( t2 * RightFluxes_H ) + + ( t3 * ( hR - hL ) ); + + + FStar[1] = + ( t1 * LeftFluxes_N ) + + ( t2 * RightFluxes_N ) + + ( t3 * ( (hR * uRn) - + (hL * uLn) ) ); + + if( sL >= 0.0f) { + out[0] = t1*LeftFluxes_H; + out[1] = t1*LeftFluxes_U; + out[2] = t1*LeftFluxes_V; + } else if ((sL < 0.0f) && (sStar >= 0.0f)){ + out[0] = FStar[0]; + FStar[2] = FStar[0] * uLp; + out[1] = FStar[1]*edgeNormals[0] - FStar[2]*edgeNormals[1]; + out[2] = FStar[1]*edgeNormals[1] + FStar[2]*edgeNormals[0]; + } else if((sStar < 0.0f) && (sR >= 0.0f)){ + out[0] = FStar[0]; + FStar[2] = FStar[0] * uRp; + out[1] = FStar[1]*edgeNormals[0] - FStar[2]*edgeNormals[1]; + out[2] = FStar[1]*edgeNormals[1] + FStar[2]*edgeNormals[0]; + } else { + out[0] = t2*RightFluxes_H; + out[1] = t2*RightFluxes_U; + out[2] = t2*RightFluxes_V; + } out[0] *= *edgeLength; out[1] *= *edgeLength; out[2] *= *edgeLength; - + // op_printf("Length %f \n", *edgeLength); float maximum = fabs(uLn + cL); maximum = maximum > fabs(uLn - cL) ? maximum : fabs(uLn - cL); maximum = maximum > fabs(uRn + cR) ? 
maximum : fabs(uRn + cR); diff --git a/sp/cuda/EvolveValuesRK2_1_kernel.cu b/sp/cuda/EvolveValuesRK2_1_kernel.cu index 5cc23d8..6f3cb62 100644 --- a/sp/cuda/EvolveValuesRK2_1_kernel.cu +++ b/sp/cuda/EvolveValuesRK2_1_kernel.cu @@ -3,42 +3,26 @@ // //user function -__device__ -inline void EvolveValuesRK2_1_gpu(const float *dT, float *midPointConservative, //OP_RW //temp - const float *in, //OP_READ - float *inConservative, //OP_WRITE //temp - float *midPoint) //OP_WRITE -{ - midPointConservative[0] *= *dT; - midPointConservative[1] *= *dT; - midPointConservative[2] *= *dT; - - //call to ToConservativeVariables inlined - inConservative[0] = in[0]; - inConservative[1] = in[0] * (in[1]); - inConservative[2] = in[0] * (in[2]); - inConservative[3] = in[3]; - - midPointConservative[0] += inConservative[0]; - midPointConservative[1] += inConservative[1]; - midPointConservative[2] += inConservative[2]; - midPointConservative[3] += inConservative[3]; - - //call to ToPhysicalVariables inlined - float TruncatedH = midPointConservative[0] < EPS ? EPS : midPointConservative[0]; - midPoint[0] = midPointConservative[0]; - midPoint[1] = midPointConservative[1] / TruncatedH; - midPoint[2] = midPointConservative[2] / TruncatedH; - midPoint[3] = midPointConservative[3]; +__device__ void EvolveValuesRK2_1_gpu( const float *dT, const float *Lw_n, + const float *in, + float *out) { + out[0] = Lw_n[0] * *dT + in[0]; + out[1] = Lw_n[1] * *dT + in[1]; + out[2] = Lw_n[2] * *dT + in[2]; + out[3] = in[3]-in[0]; + + float TruncatedH = out[0] < EPS_cuda ? 
EPS_cuda : out[0]; + out[0] = TruncatedH; + out[3] += TruncatedH; + } // CUDA kernel function __global__ void op_cuda_EvolveValuesRK2_1( const float *arg0, - float *arg1, + const float *__restrict arg1, const float *__restrict arg2, float *arg3, - float *arg4, int set_size ) { @@ -49,29 +33,26 @@ __global__ void op_cuda_EvolveValuesRK2_1( EvolveValuesRK2_1_gpu(arg0, arg1+n*4, arg2+n*4, - arg3+n*4, - arg4+n*4); + arg3+n*4); } } -//GPU host stub function -void op_par_loop_EvolveValuesRK2_1_gpu(char const *name, op_set set, +//host stub function +void op_par_loop_EvolveValuesRK2_1(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, - op_arg arg3, - op_arg arg4){ + op_arg arg3){ float*arg0h = (float *)arg0.data; - int nargs = 5; - op_arg args[5]; + int nargs = 4; + op_arg args[4]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; - args[4] = arg4; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; @@ -79,15 +60,14 @@ void op_par_loop_EvolveValuesRK2_1_gpu(char const *name, op_set set, op_timers_core(&cpu_t1, &wall_t1); OP_kernels[0].name = name; OP_kernels[0].count += 1; - if (OP_kernels[0].count==1) op_register_strides(); if (OP_diags>2) { printf(" kernel routine w/o indirection: EvolveValuesRK2_1"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; @@ -107,7 +87,6 @@ void op_par_loop_EvolveValuesRK2_1_gpu(char const *name, op_set set, int nthread = OP_BLOCK_SIZE_0; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -117,69 +96,16 @@ void op_par_loop_EvolveValuesRK2_1_gpu(char const *name, op_set set, (float *) arg1.data_d, (float *) arg2.data_d, (float *) arg3.data_d, - (float *) arg4.data_d, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + 
cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[0].time += wall_t2 - wall_t1; - OP_kernels[0].transfer += (float)set->size * arg1.size * 2.0f; + OP_kernels[0].transfer += (float)set->size * arg1.size; OP_kernels[0].transfer += (float)set->size * arg2.size; - OP_kernels[0].transfer += (float)set->size * arg3.size; - OP_kernels[0].transfer += (float)set->size * arg4.size; + OP_kernels[0].transfer += (float)set->size * arg3.size * 2.0f; } - -void op_par_loop_EvolveValuesRK2_1_cpu(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2, - op_arg arg3, - op_arg arg4); - - -//GPU host stub function -#if OP_HYBRID_GPU -void op_par_loop_EvolveValuesRK2_1(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2, - op_arg arg3, - op_arg arg4){ - - if (OP_hybrid_gpu) { - op_par_loop_EvolveValuesRK2_1_gpu(name, set, - arg0, - arg1, - arg2, - arg3, - arg4); - - }else{ - op_par_loop_EvolveValuesRK2_1_cpu(name, set, - arg0, - arg1, - arg2, - arg3, - arg4); - - } -} -#else -void op_par_loop_EvolveValuesRK2_1(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2, - op_arg arg3, - op_arg arg4){ - - op_par_loop_EvolveValuesRK2_1_gpu(name, set, - arg0, - arg1, - arg2, - arg3, - arg4); - - } -#endif //OP_HYBRID_GPU diff --git a/sp/cuda/EvolveValuesRK2_2_kernel.cu b/sp/cuda/EvolveValuesRK2_2_kernel.cu index a2cdfcd..f0a89d9 100644 --- a/sp/cuda/EvolveValuesRK2_2_kernel.cu +++ b/sp/cuda/EvolveValuesRK2_2_kernel.cu @@ -3,32 +3,24 @@ // //user function -__device__ -inline void EvolveValuesRK2_2_gpu(const float *dT, float *outConservative, //OP_RW, discard - const float *inConservative, //OP_READ, discard - const float *midPointConservative, //OP_READ, discard - float *out) //OP_WRITE - -{ - outConservative[0] = 0.5*(outConservative[0] * *dT + midPointConservative[0] + inConservative[0]); - outConservative[1] = 0.5*(outConservative[1] * *dT + midPointConservative[1] + 
inConservative[1]); - outConservative[2] = 0.5*(outConservative[2] * *dT + midPointConservative[2] + inConservative[2]); - - outConservative[0] = outConservative[0] <= EPS ? EPS : outConservative[0]; - outConservative[3] = inConservative[3]; - - //call to ToPhysicalVariables inlined - float TruncatedH = outConservative[0] < EPS ? EPS : outConservative[0]; - out[0] = outConservative[0]; - out[1] = outConservative[1] / TruncatedH; - out[2] = outConservative[2] / TruncatedH; - out[3] = outConservative[3]; +__device__ void EvolveValuesRK2_2_gpu( const float *dT,const float *Lw_1, + const float *values, + const float *w_1, + float *out) { + out[0] = 0.5*(Lw_1[0] * *dT + w_1[0] + values[0]); + out[1] = 0.5*(Lw_1[1] * *dT + w_1[1] + values[1]); + out[2] = 0.5*(Lw_1[2] * *dT + w_1[2] + values[2]); + out[3] = values[3]-values[0]; + float TruncatedH = out[0] < EPS_cuda ? EPS_cuda : out[0]; + out[0] = TruncatedH; + out[3] += TruncatedH; + } // CUDA kernel function __global__ void op_cuda_EvolveValuesRK2_2( const float *arg0, - float *arg1, + const float *__restrict arg1, const float *__restrict arg2, const float *__restrict arg3, float *arg4, @@ -48,8 +40,8 @@ __global__ void op_cuda_EvolveValuesRK2_2( } -//GPU host stub function -void op_par_loop_EvolveValuesRK2_2_gpu(char const *name, op_set set, +//host stub function +void op_par_loop_EvolveValuesRK2_2(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, @@ -72,15 +64,14 @@ void op_par_loop_EvolveValuesRK2_2_gpu(char const *name, op_set set, op_timers_core(&cpu_t1, &wall_t1); OP_kernels[1].name = name; OP_kernels[1].count += 1; - if (OP_kernels[1].count==1) op_register_strides(); if (OP_diags>2) { printf(" kernel routine w/o indirection: EvolveValuesRK2_2"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; @@ -100,7 +91,6 @@ void 
op_par_loop_EvolveValuesRK2_2_gpu(char const *name, op_set set, int nthread = OP_BLOCK_SIZE_1; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -114,65 +104,14 @@ void op_par_loop_EvolveValuesRK2_2_gpu(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[1].time += wall_t2 - wall_t1; - OP_kernels[1].transfer += (float)set->size * arg1.size * 2.0f; + OP_kernels[1].transfer += (float)set->size * arg1.size; OP_kernels[1].transfer += (float)set->size * arg2.size; OP_kernels[1].transfer += (float)set->size * arg3.size; - OP_kernels[1].transfer += (float)set->size * arg4.size; + OP_kernels[1].transfer += (float)set->size * arg4.size * 2.0f; } - -void op_par_loop_EvolveValuesRK2_2_cpu(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2, - op_arg arg3, - op_arg arg4); - - -//GPU host stub function -#if OP_HYBRID_GPU -void op_par_loop_EvolveValuesRK2_2(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2, - op_arg arg3, - op_arg arg4){ - - if (OP_hybrid_gpu) { - op_par_loop_EvolveValuesRK2_2_gpu(name, set, - arg0, - arg1, - arg2, - arg3, - arg4); - - }else{ - op_par_loop_EvolveValuesRK2_2_cpu(name, set, - arg0, - arg1, - arg2, - arg3, - arg4); - - } -} -#else -void op_par_loop_EvolveValuesRK2_2(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2, - op_arg arg3, - op_arg arg4){ - - op_par_loop_EvolveValuesRK2_2_gpu(name, set, - arg0, - arg1, - arg2, - arg3, - arg4); - - } -#endif //OP_HYBRID_GPU diff --git a/sp/cuda/Friction_manning_kernel.cu b/sp/cuda/Friction_manning_kernel.cu new file mode 100644 index 0000000..9f7f9f8 --- /dev/null +++ b/sp/cuda/Friction_manning_kernel.cu @@ -0,0 +1,125 @@ +// +// auto-generated by op2.py +// + +//user function +__device__ void 
Friction_manning_gpu( const float *dT,const float *M_n, + float *values) { + float Fr; + float TruncatedH = values[0] < EPS_cuda ? EPS_cuda : values[0]; + float u = values[1]/TruncatedH; + float v = values[2]/TruncatedH; + float speed = sqrt(u*u + v*v); + Fr = g_cuda* (*M_n * *M_n) *speed; + Fr = Fr/(pow(TruncatedH,4.0f/3.0f)); + + + + values[0] = TruncatedH; + values[3] = values[3]; + if (values[0] <= EPS_cuda){ + values[1] = 0.0f; + values[2] = 0.0f; + } else if (EPS_cuda < values[0] <= 50.0f) { + values[1] = values[1] / (1.0f + Fr * *dT); + values[2] = values[2] / (1.0f + Fr * *dT); + } else { + values[1] = values[1]; + values[2] = values[2]; + } + +} + +// CUDA kernel function +__global__ void op_cuda_Friction_manning( + const float *arg0, + const float *arg1, + float *arg2, + int set_size ) { + + + //process set elements + for ( int n=threadIdx.x+blockIdx.x*blockDim.x; n2) { + printf(" kernel routine w/o indirection: Friction_manning"); + } + + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { + + //transfer constants to GPU + int consts_bytes = 0; + consts_bytes += ROUND_UP(1*sizeof(float)); + consts_bytes += ROUND_UP(1*sizeof(float)); + reallocConstArrays(consts_bytes); + consts_bytes = 0; + arg0.data = OP_consts_h + consts_bytes; + arg0.data_d = OP_consts_d + consts_bytes; + for ( int d=0; d<1; d++ ){ + ((float *)arg0.data)[d] = arg0h[d]; + } + consts_bytes += ROUND_UP(1*sizeof(float)); + arg1.data = OP_consts_h + consts_bytes; + arg1.data_d = OP_consts_d + consts_bytes; + for ( int d=0; d<1; d++ ){ + ((float *)arg1.data)[d] = arg1h[d]; + } + consts_bytes += ROUND_UP(1*sizeof(float)); + mvConstArraysToDevice(consts_bytes); + + //set CUDA execution parameters + #ifdef OP_BLOCK_SIZE_2 + int nthread = OP_BLOCK_SIZE_2; + #else + int nthread = OP_block_size; + #endif + + int nblocks = 200; + + op_cuda_Friction_manning<<>>( + (float *) arg0.data_d, + (float *) arg1.data_d, + (float *) arg2.data_d, + set->size ); + } + 
op_mpi_set_dirtybit_cuda(nargs, args); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } + //update kernel record + op_timers_core(&cpu_t2, &wall_t2); + OP_kernels[2].time += wall_t2 - wall_t1; + OP_kernels[2].transfer += (float)set->size * arg2.size * 2.0f; +} diff --git a/sp/cuda/NumericalFluxes_kernel.cu b/sp/cuda/NumericalFluxes_kernel.cu index a93ef4a..41f2d39 100644 --- a/sp/cuda/NumericalFluxes_kernel.cu +++ b/sp/cuda/NumericalFluxes_kernel.cu @@ -3,34 +3,48 @@ // //user function -__device__ void NumericalFluxes_gpu( const float *maxEdgeEigenvalues0, - const float *maxEdgeEigenvalues1, - const float *maxEdgeEigenvalues2, - const float *EdgeVolumes0, - const float *EdgeVolumes1, - const float *EdgeVolumes2, - const float *cellVolumes, - float *zeroInit, float *minTimeStep ) { - float local = 0.0f; - local += *maxEdgeEigenvalues0 * *(EdgeVolumes0); - local += *maxEdgeEigenvalues1 * *(EdgeVolumes1); - local += *maxEdgeEigenvalues2 * *(EdgeVolumes2); - zeroInit[0] = 0.0f; - zeroInit[1] = 0.0f; - zeroInit[2] = 0.0f; - zeroInit[3] = 0.0f; - - *minTimeStep = MIN(*minTimeStep, 2.0f * *cellVolumes / local); +__device__ void NumericalFluxes_gpu( float *left, + float *right, + const float *edgeFluxes, + const float *bathySource, + const float *edgeNormals, const int *isRightBoundary, + const float *cellVolumes0, + const float *cellVolumes1) { + left[0] -= (edgeFluxes[0])/cellVolumes0[0]; + left[1] -= (edgeFluxes[1] + bathySource[0] * edgeNormals[0])/cellVolumes0[0]; + left[2] -= (edgeFluxes[2] + bathySource[0] * edgeNormals[1])/cellVolumes0[0]; + + left[1] += (bathySource[2] *edgeNormals[0])/cellVolumes0[0]; + left[2] += (bathySource[2] *edgeNormals[1])/cellVolumes0[0]; + if (!*isRightBoundary) { + right[0] += edgeFluxes[0]/cellVolumes1[0]; + right[1] += (edgeFluxes[1] + bathySource[1] * edgeNormals[0])/cellVolumes1[0]; + right[2] += (edgeFluxes[2] + bathySource[1] * edgeNormals[1])/cellVolumes1[0]; + + right[1] -= (bathySource[3] 
*edgeNormals[0])/cellVolumes1[0]; + right[2] -= (bathySource[3] *edgeNormals[1])/cellVolumes1[0]; + } else { + right[0] += 0.0f; + right[1] += 0.0f; + right[2] += 0.0f; + } + + } // CUDA kernel function __global__ void op_cuda_NumericalFluxes( - const float *__restrict ind_arg0, + float *__restrict ind_arg0, const float *__restrict ind_arg1, const int *__restrict opDat0Map, - const float *__restrict arg6, - float *arg7, - float *arg8, + const float *__restrict arg2, + const float *__restrict arg3, + const float *__restrict arg4, + const int *__restrict arg5, + int *ind_map, + short *arg_map, + int *ind_arg_sizes, + int *ind_arg_offs, int block_offset, int *blkmap, int *offset, @@ -39,11 +53,13 @@ __global__ void op_cuda_NumericalFluxes( int *colors, int nblocks, int set_size) { - float arg8_l[1]; - for ( int d=0; d<1; d++ ){ - arg8_l[d]=arg8[d+blockIdx.x*1]; - } + float arg0_l[4]; + float arg1_l[4]; + __shared__ int *ind_arg0_map, ind_arg0_size; + __shared__ float *ind_arg0_s; + + __shared__ int nelems2, ncolor; __shared__ int nelem, offset_b; extern __shared__ char shared[]; @@ -60,34 +76,86 @@ __global__ void op_cuda_NumericalFluxes( nelem = nelems[blockId]; offset_b = offset[blockId]; + nelems2 = blockDim.x*(1+(nelem-1)/blockDim.x); + ncolor = ncolors[blockId]; + + ind_arg0_size = ind_arg_sizes[0+blockId*1]; + + ind_arg0_map = &ind_map[0*set_size] + ind_arg_offs[0+blockId*1]; + + //set shared memory pointers + int nbytes = 0; + ind_arg0_s = (float *) &shared[nbytes]; } __syncthreads(); // make sure all of above completed - for ( int n=threadIdx.x; n=0) { + arg0_map = arg_map[0*set_size+n+offset_b]; + arg1_map = arg_map[1*set_size+n+offset_b]; + } - for ( int d=0; d<1; d++ ){ - op_reduction(&arg8[d+blockIdx.x*1],arg8_l[d]); + for ( int col=0; col2) { printf(" kernel routine with indirection: NumericalFluxes\n"); } //get plan - #ifdef OP_PART_SIZE_24 - int part_size = OP_PART_SIZE_24; + #ifdef OP_PART_SIZE_26 + int part_size = OP_PART_SIZE_26; #else int part_size 
= OP_part_size; #endif - int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { - op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds); - - //transfer global reduction data to GPU - int maxblocks = 0; - for ( int col=0; colncolors; col++ ){ - maxblocks = MAX(maxblocks,Plan->ncolblk[col]); - } - int reduct_bytes = 0; - int reduct_size = 0; - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(float)); - reduct_size = MAX(reduct_size,sizeof(float)); - reallocReductArrays(reduct_bytes); - reduct_bytes = 0; - arg8.data = OP_reduct_h + reduct_bytes; - arg8.data_d = OP_reduct_d + reduct_bytes; - for ( int b=0; bncolors; col++ ){ if (col==Plan->ncolors_core) { - op_mpi_wait_all_cuda(nargs, args); + op_mpi_wait_all_grouped(nargs, args, 2); } - #ifdef OP_BLOCK_SIZE_24 - int nthread = OP_BLOCK_SIZE_24; + #ifdef OP_BLOCK_SIZE_26 + int nthread = OP_BLOCK_SIZE_26; #else int nthread = OP_block_size; #endif @@ -182,14 +226,19 @@ void op_par_loop_NumericalFluxes(char const *name, op_set set, dim3 nblocks = dim3(Plan->ncolblk[col] >= (1<<16) ? 65535 : Plan->ncolblk[col], Plan->ncolblk[col] >= (1<<16) ? 
(Plan->ncolblk[col]-1)/65535+1: 1, 1); if (Plan->ncolblk[col] > 0) { - int nshared = reduct_size*nthread; + int nshared = Plan->nsharedCol[col]; op_cuda_NumericalFluxes<<>>( (float *)arg0.data_d, - (float *)arg3.data_d, + (float *)arg6.data_d, arg0.map_data_d, - (float*)arg6.data_d, - (float*)arg7.data_d, - (float*)arg8.data_d, + (float*)arg2.data_d, + (float*)arg3.data_d, + (float*)arg4.data_d, + (int*)arg5.data_d, + Plan->ind_map, + Plan->loc_map, + Plan->ind_sizes, + Plan->ind_offs, block_offset, Plan->blkmap, Plan->offset, @@ -199,26 +248,17 @@ void op_par_loop_NumericalFluxes(char const *name, op_set set, Plan->ncolblk[col], set->size+set->exec_size); - //transfer global reduction data back to CPU - if (col == Plan->ncolors_owned-1) { - mvReductArraysToHost(reduct_bytes); - } } block_offset += Plan->ncolblk[col]; } - OP_kernels[24].transfer += Plan->transfer; - OP_kernels[24].transfer2 += Plan->transfer2; - for ( int b=0; btransfer; + OP_kernels[26].transfer2 += Plan->transfer2; } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[24].time += wall_t2 - wall_t1; + OP_kernels[26].time += wall_t2 - wall_t1; } diff --git a/sp/cuda/Timestep_kernel.cu b/sp/cuda/Timestep_kernel.cu new file mode 100644 index 0000000..6675f74 --- /dev/null +++ b/sp/cuda/Timestep_kernel.cu @@ -0,0 +1,218 @@ +// +// auto-generated by op2.py +// + +//user function +__device__ void Timestep_gpu( const float *maxEdgeEigenvalues0, + const float *maxEdgeEigenvalues1, + const float *maxEdgeEigenvalues2, + const float *EdgeVolumes0, + const float *EdgeVolumes1, + const float *EdgeVolumes2, + const float *cellVolumes, + float *minTimeStep ) { + float local = 0.0f; + local += *maxEdgeEigenvalues0 * *(EdgeVolumes0); + local += *maxEdgeEigenvalues1 * *(EdgeVolumes1); + local += *maxEdgeEigenvalues2 * *(EdgeVolumes2); + *minTimeStep = 
MIN(*minTimeStep, 2.0f * *cellVolumes / local); + +} + +// CUDA kernel function +__global__ void op_cuda_Timestep( + const float *__restrict ind_arg0, + const float *__restrict ind_arg1, + const int *__restrict opDat0Map, + const float *__restrict arg6, + float *arg7, + int block_offset, + int *blkmap, + int *offset, + int *nelems, + int *ncolors, + int *colors, + int nblocks, + int set_size) { + float arg7_l[1]; + for ( int d=0; d<1; d++ ){ + arg7_l[d]=arg7[d+blockIdx.x*1]; + } + + + __shared__ int nelem, offset_b; + + extern __shared__ char shared[]; + + if (blockIdx.x+blockIdx.y*gridDim.x >= nblocks) { + return; + } + if (threadIdx.x==0) { + + //get sizes and shift pointers and direct-mapped data + + int blockId = blkmap[blockIdx.x + blockIdx.y*gridDim.x + block_offset]; + + nelem = nelems[blockId]; + offset_b = offset[blockId]; + + } + __syncthreads(); // make sure all of above completed + + for ( int n=threadIdx.x; n(&arg7[d+blockIdx.x*1],arg7_l[d]); + } +} + + +//host stub function +void op_par_loop_Timestep(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3, + op_arg arg4, + op_arg arg5, + op_arg arg6, + op_arg arg7){ + + float*arg7h = (float *)arg7.data; + int nargs = 8; + op_arg args[8]; + + args[0] = arg0; + args[1] = arg1; + args[2] = arg2; + args[3] = arg3; + args[4] = arg4; + args[5] = arg5; + args[6] = arg6; + args[7] = arg7; + + // initialise timers + double cpu_t1, cpu_t2, wall_t1, wall_t2; + op_timing_realloc(25); + op_timers_core(&cpu_t1, &wall_t1); + OP_kernels[25].name = name; + OP_kernels[25].count += 1; + + + int ninds = 2; + int inds[8] = {0,0,0,1,1,1,-1,-1}; + + if (OP_diags>2) { + printf(" kernel routine with indirection: Timestep\n"); + } + + //get plan + #ifdef OP_PART_SIZE_25 + int part_size = OP_PART_SIZE_25; + #else + int part_size = OP_part_size; + #endif + + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { + + op_plan *Plan = 
op_plan_get(name,set,part_size,nargs,args,ninds,inds); + + //transfer global reduction data to GPU + int maxblocks = 0; + for ( int col=0; colncolors; col++ ){ + maxblocks = MAX(maxblocks,Plan->ncolblk[col]); + } + int reduct_bytes = 0; + int reduct_size = 0; + reduct_bytes += ROUND_UP(maxblocks*1*sizeof(float)); + reduct_size = MAX(reduct_size,sizeof(float)); + reallocReductArrays(reduct_bytes); + reduct_bytes = 0; + arg7.data = OP_reduct_h + reduct_bytes; + arg7.data_d = OP_reduct_d + reduct_bytes; + for ( int b=0; bncolors; col++ ){ + if (col==Plan->ncolors_core) { + op_mpi_wait_all_grouped(nargs, args, 2); + } + #ifdef OP_BLOCK_SIZE_25 + int nthread = OP_BLOCK_SIZE_25; + #else + int nthread = OP_block_size; + #endif + + dim3 nblocks = dim3(Plan->ncolblk[col] >= (1<<16) ? 65535 : Plan->ncolblk[col], + Plan->ncolblk[col] >= (1<<16) ? (Plan->ncolblk[col]-1)/65535+1: 1, 1); + if (Plan->ncolblk[col] > 0) { + int nshared = MAX(Plan->nshared,reduct_size*nthread); + op_cuda_Timestep<<>>( + (float *)arg0.data_d, + (float *)arg3.data_d, + arg0.map_data_d, + (float*)arg6.data_d, + (float*)arg7.data_d, + block_offset, + Plan->blkmap, + Plan->offset, + Plan->nelems, + Plan->nthrcol, + Plan->thrcol, + Plan->ncolblk[col], + set->size+set->exec_size); + + //transfer global reduction data back to CPU + if (col == Plan->ncolors_owned-1) { + mvReductArraysToHost(reduct_bytes); + } + } + block_offset += Plan->ncolblk[col]; + } + OP_kernels[25].transfer += Plan->transfer; + OP_kernels[25].transfer2 += Plan->transfer2; + for ( int b=0; b1) { + cutilSafeCall(cudaDeviceSynchronize()); + } + //update kernel record + op_timers_core(&cpu_t2, &wall_t2); + OP_kernels[25].time += wall_t2 - wall_t1; +} diff --git a/sp/cuda/applyConst_kernel.cu b/sp/cuda/applyConst_kernel.cu index 8acabea..eccfb93 100644 --- a/sp/cuda/applyConst_kernel.cu +++ b/sp/cuda/applyConst_kernel.cu @@ -17,9 +17,10 @@ __device__ void applyConst_gpu( const float *in, float *out, const int *variable } if (*variables & 8) 
{ - out[3] = *in; + *out = *in; } + } // CUDA kernel function @@ -35,7 +36,7 @@ __global__ void op_cuda_applyConst( //user-supplied kernel call applyConst_gpu(arg0+n*1, - arg1+n*4, + arg1+n*1, arg2); } } @@ -57,18 +58,18 @@ void op_par_loop_applyConst(char const *name, op_set set, // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(10); + op_timing_realloc(8); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[10].name = name; - OP_kernels[10].count += 1; + OP_kernels[8].name = name; + OP_kernels[8].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: applyConst"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; @@ -84,11 +85,10 @@ void op_par_loop_applyConst(char const *name, op_set set, mvConstArraysToDevice(consts_bytes); //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_10 - int nthread = OP_BLOCK_SIZE_10; + #ifdef OP_BLOCK_SIZE_8 + int nthread = OP_BLOCK_SIZE_8; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -100,10 +100,12 @@ void op_par_loop_applyConst(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[10].time += wall_t2 - wall_t1; - OP_kernels[10].transfer += (float)set->size * arg0.size; - OP_kernels[10].transfer += (float)set->size * arg1.size * 2.0f; + OP_kernels[8].time += wall_t2 - wall_t1; + OP_kernels[8].transfer += (float)set->size * arg0.size; + OP_kernels[8].transfer += (float)set->size * arg1.size * 2.0f; } diff --git a/sp/cuda/computeFluxes_kernel.cu b/sp/cuda/computeFluxes_kernel.cu index fb203d9..983eace 100644 --- a/sp/cuda/computeFluxes_kernel.cu +++ 
b/sp/cuda/computeFluxes_kernel.cu @@ -11,11 +11,13 @@ __device__ void computeFluxes_gpu( const float *cellLeft, const float *cellRight const float *leftGradient, const float *rightGradient, const int *isRightBoundary, float *bathySource, float *out, - float *maxEdgeEigenvalues) { + float *maxEdgeEigenvalues, const float *zmin) { float leftCellValues[4]; float rightCellValues[4]; float InterfaceBathy; + float zL, zR; + float uR,vR,uL,vL; leftCellValues[0] = cellLeft[0]; leftCellValues[1] = cellLeft[1]; leftCellValues[2] = cellLeft[2]; @@ -23,65 +25,86 @@ __device__ void computeFluxes_gpu( const float *cellLeft, const float *cellRight float dxl, dyl, dxr, dyr; dxl = (edgeCenters[0] - leftcellCenters[0]); dyl = (edgeCenters[1] - leftcellCenters[1]); - dxr = (edgeCenters[0] - rightcellCenters[0]); dyr = (edgeCenters[1] - rightcellCenters[1]); + leftCellValues[0] += alphaleft[0] * ((dxl * leftGradient[0])+(dyl * leftGradient[1])); + leftCellValues[1] += alphaleft[0] * ((dxl * leftGradient[2])+(dyl * leftGradient[3])); + leftCellValues[2] += alphaleft[0] * ((dxl * leftGradient[4])+(dyl * leftGradient[5])); + leftCellValues[3] += alphaleft[0] * ((dxl * leftGradient[6])+(dyl * leftGradient[7])); + if (leftCellValues[0] >= 1e-3){ + uL = leftCellValues[1]/leftCellValues[0]; + vL = leftCellValues[2]/leftCellValues[0]; + } else { + uL = 0.0f; + vL = 0.0f; + } + zL = cellLeft[3] - cellLeft[0]; + zR = cellRight[3] - cellRight[0]; if (!*isRightBoundary) { rightCellValues[0] = cellRight[0]; rightCellValues[1] = cellRight[1]; rightCellValues[2] = cellRight[2]; rightCellValues[3] = cellRight[3]; - } else { - rightCellValues[3] = cellLeft[3]; + rightCellValues[0] += alpharight[0] * ((dxr * rightGradient[0])+(dyr * rightGradient[1])); + rightCellValues[1] += alpharight[0] * ((dxr * rightGradient[2])+(dyr * rightGradient[3])); + rightCellValues[2] += alpharight[0] * ((dxr * rightGradient[4])+(dyr * rightGradient[5])); + rightCellValues[3] += alpharight[0] * ((dxr * 
rightGradient[6])+(dyr * rightGradient[7])); + if (rightCellValues[0] >= 1e-3){ + uR = rightCellValues[1]/rightCellValues[0]; + vR = rightCellValues[2]/rightCellValues[0]; + } else { + uR = 0.0f; + vR = 0.0f; + } + } else { float nx = edgeNormals[0]; float ny = edgeNormals[1]; - float inNormalVelocity = cellLeft[1] * nx + cellLeft[2] * ny; - float inTangentVelocity = -1.0f * cellLeft[1] * ny + cellLeft[2] * nx; + float inNormalVelocity = uL * nx + vL * ny; + float inTangentVelocity = -1.0f * uL * ny + vL * nx; float outNormalVelocity = 0.0f; float outTangentVelocity = 0.0f; - rightCellValues[0] = cellLeft[0]; - outNormalVelocity = -1.0f*inNormalVelocity; + float wet = (fabs(*zmin) - zL) > 0.0f ? (fabs(*zmin) - zL) : EPS_cuda; + float critical = sqrt(uL*uL + vL*vL); outTangentVelocity = inTangentVelocity; + if (critical < sqrt(g_cuda*leftCellValues[0])){ + rightCellValues[0] = wet; + rightCellValues[3] = wet + zL; + outNormalVelocity = 0.0f; + } else { + rightCellValues[0] = leftCellValues[0]; + rightCellValues[3] = leftCellValues[3]; + outNormalVelocity = inNormalVelocity; + } + - rightCellValues[1] = outNormalVelocity * nx - outTangentVelocity * ny; - rightCellValues[2] = outNormalVelocity * ny + outTangentVelocity * nx; + uR = outNormalVelocity * nx - outTangentVelocity * ny; + vR = outNormalVelocity * ny + outTangentVelocity * nx; + rightCellValues[1] = uR*rightCellValues[0]; + rightCellValues[2] = vR*rightCellValues[0]; + zR = zL; } + rightCellValues[3] -= rightCellValues[0]; + leftCellValues[3] -= leftCellValues[0]; - if (!*isRightBoundary) { - leftCellValues[0] += alphaleft[0] * ((dxl * leftGradient[0])+(dyl * leftGradient[1])); - leftCellValues[0] = leftCellValues[0] > 0.0f ? 
leftCellValues[0] : 0.0f; - leftCellValues[3] += alphaleft[0] * ((dxl * leftGradient[6])+(dyl * leftGradient[7])); - leftCellValues[1] += alphaleft[0] * ((dxl * leftGradient[2])+(dyl * leftGradient[3])); - leftCellValues[2] += alphaleft[0] * ((dxl * leftGradient[4])+(dyl * leftGradient[5])); - - rightCellValues[0] += alpharight[0] * ((dxr * rightGradient[0])+(dyr * rightGradient[1])); - rightCellValues[0] = rightCellValues[0] > 0.0f ? rightCellValues[0] : 0.0f; - rightCellValues[3] += alpharight[0] * ((dxr * rightGradient[6])+(dyr * rightGradient[7])); - rightCellValues[1] += alpharight[0] * ((dxr * rightGradient[2])+(dyr * rightGradient[3])); - rightCellValues[2] += alpharight[0] * ((dxr * rightGradient[4])+(dyr * rightGradient[5])); - } InterfaceBathy = leftCellValues[3] > rightCellValues[3] ? leftCellValues[3] : rightCellValues[3]; - bathySource[0] =.5f * g * (leftCellValues[0]*leftCellValues[0]); - bathySource[1] =.5f * g * (rightCellValues[0]*rightCellValues[0]); + bathySource[0] =0.5f * g_cuda * (leftCellValues[0]*leftCellValues[0]); + bathySource[1] =0.5f * g_cuda * (rightCellValues[0]*rightCellValues[0]); float hL = (leftCellValues[0] + leftCellValues[3] - InterfaceBathy); - hL = hL > 0.0f? hL : 0.0f; + hL = hL > 0.0f ? hL : 0.0f; float hR = (rightCellValues[0] + rightCellValues[3] - InterfaceBathy); hR = hR > 0.0f ? 
hR : 0.0f; - bathySource[0] -= .5f * g * (hL * hL); - bathySource[1] -= .5f * g * (hR * hR); - - bathySource[2] = -.5f * g *(leftCellValues[0] + cellLeft[0])*(leftCellValues[3] - cellLeft[3]); - bathySource[3] = -.5f * g *(rightCellValues[0] + cellRight[0])*(rightCellValues[3] - cellRight[3]); + bathySource[0] -= .5f * g_cuda * (hL * hL); + bathySource[1] -= .5f * g_cuda * (hR * hR); - leftCellValues[0] = hL; - rightCellValues[0] = hR; + bathySource[2] = -.5f * g_cuda *(leftCellValues[0] + cellLeft[0])*(leftCellValues[3] - zL); + bathySource[3] = -.5f * g_cuda *(rightCellValues[0] + cellRight[0])*(rightCellValues[3] - zR); bathySource[0] *= *edgeLength; bathySource[1] *= *edgeLength; @@ -90,13 +113,13 @@ __device__ void computeFluxes_gpu( const float *cellLeft, const float *cellRight - - float cL = sqrt(g * leftCellValues[0]); + float cL = sqrt(g_cuda * hL); cL = cL > 0.0f ? cL : 0.0f; - float cR = sqrt(g * rightCellValues[0]); + float cR = sqrt(g_cuda * hR); cR = cR > 0.0f ? cR : 0.0f; - float uLn = leftCellValues[1] * edgeNormals[0] + leftCellValues[2] * edgeNormals[1]; - float uRn = rightCellValues[1] * edgeNormals[0] + rightCellValues[2] * edgeNormals[1]; + + float uLn = uL * edgeNormals[0] + vL * edgeNormals[1]; + float uRn = uR * edgeNormals[0] + vR * edgeNormals[1]; float unStar = 0.5f * (uLn + uRn) + (cL-cR); float cStar = 0.5f * (cL + cR) - 0.25f* (uRn-uLn); @@ -104,85 +127,91 @@ __device__ void computeFluxes_gpu( const float *cellLeft, const float *cellRight float sR = (uRn + cR) > (unStar + cStar) ? 
(uRn + cR) : (unStar + cStar); float sStar; - sStar = (sL*rightCellValues[0]*(uRn - sR) - sR*leftCellValues[0]*(uLn - sL))/ - (rightCellValues[0]*(uRn - sR) - leftCellValues[0]*(uLn - sL)); + sStar = (sL*hR*(uRn - sR) - sR*hL*(uLn - sL))/ + (hR*(uRn - sR) - hL*(uLn - sL)); + + if ((leftCellValues[0] <= EPS_cuda) && (rightCellValues[0] > EPS_cuda)) { - if ((leftCellValues[0] <= EPS) && (rightCellValues[0] > EPS)) { sL = uRn - 2.0f*cR; sR = uRn + cR; sStar = sL; } - if ((rightCellValues[0] <= EPS) && (leftCellValues[0] > EPS)) { + if ((rightCellValues[0] <= EPS_cuda) && (leftCellValues[0] > EPS_cuda)) { + sR = uLn + 2.0f*cL; sL = uLn - cL; sStar = sR; } - float sLMinus = sL < 0.0f ? sL : 0.0f; - float sRPlus = sR > 0.0f ? sR : 0.0f; - float sRMinussL = sRPlus - sLMinus; - sRMinussL = sRMinussL < EPS ? EPS : sRMinussL; - float t1 = sRPlus / sRMinussL; + float uLp = vL*edgeNormals[0] - uL*edgeNormals[1]; + float uRp = vR*edgeNormals[0] - uR*edgeNormals[1]; + float LeftFluxes_H, LeftFluxes_N, LeftFluxes_U, LeftFluxes_V; - float t2 = ( -1.0 * sLMinus ) / sRMinussL; + float HuDotN = (hL*uL) * edgeNormals[0] + (hL*vL) * edgeNormals[1]; + LeftFluxes_H = HuDotN; + LeftFluxes_U = HuDotN * uL; + LeftFluxes_V = HuDotN * vL; - float t3 = ( sRPlus * sLMinus ) / sRMinussL; + LeftFluxes_N = HuDotN * uLn; + + LeftFluxes_U += (.5f * g_cuda * edgeNormals[0] ) * ( hL * hL ); + LeftFluxes_V += (.5f * g_cuda * edgeNormals[1] ) * ( hL * hL ); + LeftFluxes_N += (.5f * g_cuda ) * ( hL * hL ); + + + float RightFluxes_H,RightFluxes_N, RightFluxes_U, RightFluxes_V; + + HuDotN = (hR*uR) * edgeNormals[0] + (hR*vR) * edgeNormals[1]; + + RightFluxes_H = HuDotN; + RightFluxes_U = HuDotN * uR; + RightFluxes_V = HuDotN * vR; + + RightFluxes_N = HuDotN * uRn; + + RightFluxes_U += (.5f * g_cuda * edgeNormals[0] ) * ( hR * hR ); + RightFluxes_V += (.5f * g_cuda * edgeNormals[1] ) * ( hR * hR ); + RightFluxes_N += (.5f * g_cuda ) * ( hR * hR ); - float uLp = leftCellValues[2]*edgeNormals[0] - 
leftCellValues[1]*edgeNormals[1]; - float uRp = rightCellValues[2]*edgeNormals[0] - rightCellValues[1]*edgeNormals[1]; - float LeftFluxes_H, LeftFluxes_U, LeftFluxes_V, LeftFluxes_N; - float HuDotN = (leftCellValues[0] * leftCellValues[1]) * edgeNormals[0] + - (leftCellValues[0] * leftCellValues[2]) * edgeNormals[1]; - LeftFluxes_H = HuDotN; - LeftFluxes_U = HuDotN * leftCellValues[1]; - LeftFluxes_V = HuDotN * leftCellValues[2]; - LeftFluxes_N = HuDotN * uLn; - LeftFluxes_U += (.5f * g * edgeNormals[0] ) * ( leftCellValues[0] * leftCellValues[0] ); - LeftFluxes_V += (.5f * g * edgeNormals[1] ) * ( leftCellValues[0] * leftCellValues[0] ); - LeftFluxes_N += (.5f * g ) * ( leftCellValues[0] * leftCellValues[0] ); - float RightFluxes_H,RightFluxes_U, RightFluxes_V, RightFluxes_N; - HuDotN = (rightCellValues[0] * rightCellValues[1] * edgeNormals[0]) + - (rightCellValues[0] * rightCellValues[2] * edgeNormals[1]); - RightFluxes_H = HuDotN; - RightFluxes_U = HuDotN * rightCellValues[1]; - RightFluxes_V = HuDotN * rightCellValues[2]; - RightFluxes_N = HuDotN * uRn; - RightFluxes_U += (.5f * g * edgeNormals[0] ) * ( rightCellValues[0] * rightCellValues[0] ); - RightFluxes_V += (.5f * g * edgeNormals[1] ) * ( rightCellValues[0] * rightCellValues[0] ); - RightFluxes_N += (.5f * g) * ( rightCellValues[0] * rightCellValues[0] ); + float sLMinus = sL < 0.0f ? sL : 0.0f; + float sRPlus = sR > 0.0f ? sR : 0.0f; + float sRMinussL = sRPlus - sLMinus; + sRMinussL = sRMinussL < EPS_cuda ? 
EPS_cuda : sRMinussL; + float t1 = sRPlus / sRMinussL; + float t2 = ( -1.0 * sLMinus ) / sRMinussL; + float t3 = ( sRPlus * sLMinus ) / sRMinussL; float FStar[3]; FStar[0] = ( t1 * LeftFluxes_H ) + ( t2 * RightFluxes_H ) + - ( t3 * ( rightCellValues[0] - leftCellValues[0] ) ); + ( t3 * ( hR - hL ) ); FStar[1] = ( t1 * LeftFluxes_N ) + ( t2 * RightFluxes_N ) + - ( t3 * ( (rightCellValues[0] * uRn) - - (leftCellValues[0] * uLn) ) ); - + ( t3 * ( (hR * uRn) - + (hL * uLn) ) ); if( sL >= 0.0f) { - out[0] = LeftFluxes_H; - out[1] = LeftFluxes_U; - out[2] = LeftFluxes_V; + out[0] = t1*LeftFluxes_H; + out[1] = t1*LeftFluxes_U; + out[2] = t1*LeftFluxes_V; } else if ((sL < 0.0f) && (sStar >= 0.0f)){ out[0] = FStar[0]; FStar[2] = FStar[0] * uLp; @@ -194,18 +223,20 @@ __device__ void computeFluxes_gpu( const float *cellLeft, const float *cellRight out[1] = FStar[1]*edgeNormals[0] - FStar[2]*edgeNormals[1]; out[2] = FStar[1]*edgeNormals[1] + FStar[2]*edgeNormals[0]; } else { - out[0] = RightFluxes_H; - out[1] = RightFluxes_U; - out[2] = RightFluxes_V; + out[0] = t2*RightFluxes_H; + out[1] = t2*RightFluxes_U; + out[2] = t2*RightFluxes_V; } out[0] *= *edgeLength; out[1] *= *edgeLength; out[2] *= *edgeLength; + float maximum = fabs(uLn + cL); maximum = maximum > fabs(uLn - cL) ? maximum : fabs(uLn - cL); maximum = maximum > fabs(uRn + cR) ? maximum : fabs(uRn + cR); maximum = maximum > fabs(uRn - cR) ? 
maximum : fabs(uRn - cR); *maxEdgeEigenvalues = maximum; + } // CUDA kernel function @@ -222,6 +253,7 @@ __global__ void op_cuda_computeFluxes( float *arg12, float *arg13, float *arg14, + const float *arg15, int block_offset, int *blkmap, int *offset, @@ -231,6 +263,7 @@ __global__ void op_cuda_computeFluxes( int nblocks, int set_size) { + __shared__ int nelem, offset_b; extern __shared__ char shared[]; @@ -272,7 +305,8 @@ __global__ void op_cuda_computeFluxes( arg11+(n+offset_b)*1, arg12+(n+offset_b)*4, arg13+(n+offset_b)*3, - arg14+(n+offset_b)*1); + arg14+(n+offset_b)*1, + arg15); } } @@ -293,10 +327,12 @@ void op_par_loop_computeFluxes(char const *name, op_set set, op_arg arg11, op_arg arg12, op_arg arg13, - op_arg arg14){ + op_arg arg14, + op_arg arg15){ - int nargs = 15; - op_arg args[15]; + float*arg15h = (float *)arg15.data; + int nargs = 16; + op_arg args[16]; args[0] = arg0; args[1] = arg1; @@ -313,43 +349,57 @@ void op_par_loop_computeFluxes(char const *name, op_set set, args[12] = arg12; args[13] = arg13; args[14] = arg14; + args[15] = arg15; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(23); + op_timing_realloc(24); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[23].name = name; - OP_kernels[23].count += 1; + OP_kernels[24].name = name; + OP_kernels[24].count += 1; int ninds = 4; - int inds[15] = {0,0,1,1,-1,-1,2,2,-1,3,3,-1,-1,-1,-1}; + int inds[16] = {0,0,1,1,-1,-1,2,2,-1,3,3,-1,-1,-1,-1,-1}; if (OP_diags>2) { printf(" kernel routine with indirection: computeFluxes\n"); } //get plan - #ifdef OP_PART_SIZE_23 - int part_size = OP_PART_SIZE_23; + #ifdef OP_PART_SIZE_24 + int part_size = OP_PART_SIZE_24; #else int part_size = OP_part_size; #endif - int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds); + //transfer constants to 
GPU + int consts_bytes = 0; + consts_bytes += ROUND_UP(1*sizeof(float)); + reallocConstArrays(consts_bytes); + consts_bytes = 0; + arg15.data = OP_consts_h + consts_bytes; + arg15.data_d = OP_consts_d + consts_bytes; + for ( int d=0; d<1; d++ ){ + ((float *)arg15.data)[d] = arg15h[d]; + } + consts_bytes += ROUND_UP(1*sizeof(float)); + mvConstArraysToDevice(consts_bytes); + //execute plan int block_offset = 0; for ( int col=0; colncolors; col++ ){ if (col==Plan->ncolors_core) { - op_mpi_wait_all_cuda(nargs, args); + op_mpi_wait_all_grouped(nargs, args, 2); } - #ifdef OP_BLOCK_SIZE_23 - int nthread = OP_BLOCK_SIZE_23; + #ifdef OP_BLOCK_SIZE_24 + int nthread = OP_BLOCK_SIZE_24; #else int nthread = OP_block_size; #endif @@ -370,6 +420,7 @@ void op_par_loop_computeFluxes(char const *name, op_set set, (float*)arg12.data_d, (float*)arg13.data_d, (float*)arg14.data_d, + (float*)arg15.data_d, block_offset, Plan->blkmap, Plan->offset, @@ -382,12 +433,14 @@ void op_par_loop_computeFluxes(char const *name, op_set set, } block_offset += Plan->ncolblk[col]; } - OP_kernels[23].transfer += Plan->transfer; - OP_kernels[23].transfer2 += Plan->transfer2; + OP_kernels[24].transfer += Plan->transfer; + OP_kernels[24].transfer2 += Plan->transfer2; } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[23].time += wall_t2 - wall_t1; + OP_kernels[24].time += wall_t2 - wall_t1; } diff --git a/sp/cuda/computeGradient_kernel.cu b/sp/cuda/computeGradient_kernel.cu index 98eef74..987edb9 100644 --- a/sp/cuda/computeGradient_kernel.cu +++ b/sp/cuda/computeGradient_kernel.cu @@ -13,8 +13,7 @@ __device__ void computeGradient_gpu( const float *center, const float *nb3Center, float *q, float *out) { - - if( (cellCenter[0] != nb3Center[0]) && (cellCenter[1] != nb3Center[1])){ + if(center[0]> EPS_cuda){ float total, Rhs[8]; float dh[3], 
dz[3],du[3], dv[3], weights[3]; float Gram[2][2], inverse[2][2], delta[3][2]; @@ -27,17 +26,24 @@ __device__ void computeGradient_gpu( const float *center, delta[1][0] = (nb2Center[0] - x); delta[1][1] = (nb2Center[1] - y); - delta[2][0] = (nb3Center[0] - x); - delta[2][1] = (nb3Center[1] - y); + if( (cellCenter[0] != nb3Center[0]) && (cellCenter[1] != nb3Center[1])){ + delta[2][0] = (nb3Center[0] - x); + delta[2][1] = (nb3Center[1] - y); + } else { + delta[2][0] = 0.5f*(delta[0][0] + delta[1][0]); + delta[2][1] = 0.5f*(delta[0][1] + delta[1][1]); + } weights[0] = sqrt(delta[0][0] * delta[0][0] + delta[0][1] * delta[0][1]); weights[1] = sqrt(delta[1][0] * delta[1][0] + delta[1][1] * delta[1][1]); weights[2] = sqrt(delta[2][0] * delta[2][0] + delta[2][1] * delta[2][1]); + total = weights[0] + weights[1] + weights[2]; + weights[0] = total/weights[0]; weights[1] = total/weights[1]; - weights[2] = total/ weights[2]; + weights[2] = total/weights[2]; delta[0][0] *= weights[0]; delta[0][1] *= weights[0]; @@ -105,7 +111,18 @@ __device__ void computeGradient_gpu( const float *center, Rhs[7] = (delta[0][1]*dz[0]) + (delta[1][1]*dz[1]) + (delta[2][1]*dz[2]); out[6] = (inverse[0][0] * Rhs[6]) + (inverse[0][1] * Rhs[7]); out[7] = (inverse[1][0] * Rhs[6]) + (inverse[1][1] * Rhs[7]); - }else { + + if((isnan(out[0])) || (isnan(out[1])) || (isnan(out[2])) || (isnan(out[3])) || (isnan(out[4])) || (isnan(out[5])) || (isnan(out[6])) || (isnan(out[7]))){ + out[0] = 0.0f; + out[1] = 0.0f; + out[2] = 0.0f; + out[3] = 0.0f; + out[4] = 0.0f; + out[5] = 0.0f; + out[6] = 0.0f; + out[7] = 0.0f; + } + } else { out[0] = 0.0f; out[1] = 0.0f; @@ -115,7 +132,7 @@ __device__ void computeGradient_gpu( const float *center, out[5] = 0.0f; out[6] = 0.0f; out[7] = 0.0f; - } + } @@ -148,65 +165,45 @@ __device__ void computeGradient_gpu( const float *center, q[7] = center[3] > neighbour1[3] ? center[3] : neighbour1[3]; q[7] = q[7] > neighbour2[3] ? q[7] : neighbour2[3]; q[7] = q[7] > neighbour3[3] ? 
q[7] : neighbour3[3]; + } // CUDA kernel function __global__ void op_cuda_computeGradient( const float *__restrict ind_arg0, const float *__restrict ind_arg1, + float *__restrict ind_arg2, const int *__restrict opDat1Map, const float *__restrict arg0, const float *__restrict arg4, float *arg8, float *arg9, - int block_offset, - int *blkmap, - int *offset, - int *nelems, - int *ncolors, - int *colors, - int nblocks, + int start, + int end, + int *col_reord, int set_size) { - - __shared__ int nelem, offset_b; - - extern __shared__ char shared[]; - - if (blockIdx.x+blockIdx.y*gridDim.x >= nblocks) { - return; - } - if (threadIdx.x==0) { - - //get sizes and shift pointers and direct-mapped data - - int blockId = blkmap[blockIdx.x + blockIdx.y*gridDim.x + block_offset]; - - nelem = nelems[blockId]; - offset_b = offset[blockId]; - - } - __syncthreads(); // make sure all of above completed - - for ( int n=threadIdx.x; n2) { printf(" kernel routine with indirection: computeGradient\n"); } //get plan - #ifdef OP_PART_SIZE_21 - int part_size = OP_PART_SIZE_21; + #ifdef OP_PART_SIZE_22 + int part_size = OP_PART_SIZE_22; #else int part_size = OP_part_size; #endif - int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { - op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds); + op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2); //execute plan - - int block_offset = 0; for ( int col=0; colncolors; col++ ){ if (col==Plan->ncolors_core) { - op_mpi_wait_all_cuda(nargs, args); + op_mpi_wait_all_grouped(nargs, args, 2); } - #ifdef OP_BLOCK_SIZE_21 - int nthread = OP_BLOCK_SIZE_21; + #ifdef OP_BLOCK_SIZE_22 + int nthread = OP_BLOCK_SIZE_22; #else int nthread = OP_block_size; #endif - dim3 nblocks = dim3(Plan->ncolblk[col] >= (1<<16) ? 65535 : Plan->ncolblk[col], - Plan->ncolblk[col] >= (1<<16) ? 
(Plan->ncolblk[col]-1)/65535+1: 1, 1); - if (Plan->ncolblk[col] > 0) { - op_cuda_computeGradient<<>>( - (float *)arg1.data_d, - (float *)arg5.data_d, - arg1.map_data_d, - (float*)arg0.data_d, - (float*)arg4.data_d, - (float*)arg8.data_d, - (float*)arg9.data_d, - block_offset, - Plan->blkmap, - Plan->offset, - Plan->nelems, - Plan->nthrcol, - Plan->thrcol, - Plan->ncolblk[col], - set->size+set->exec_size); + int start = Plan->col_offsets[0][col]; + int end = Plan->col_offsets[0][col+1]; + int nblocks = (end - start - 1)/nthread + 1; + op_cuda_computeGradient<<>>( + (float *)arg1.data_d, + (float *)arg5.data_d, + (float *)arg7.data_d, + arg1.map_data_d, + (float*)arg0.data_d, + (float*)arg4.data_d, + (float*)arg8.data_d, + (float*)arg9.data_d, + start, + end, + Plan->col_reord, + set->size+set->exec_size); - } - block_offset += Plan->ncolblk[col]; } - OP_kernels[21].transfer += Plan->transfer; - OP_kernels[21].transfer2 += Plan->transfer2; + OP_kernels[22].transfer += Plan->transfer; + OP_kernels[22].transfer2 += Plan->transfer2; } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[21].time += wall_t2 - wall_t1; + OP_kernels[22].time += wall_t2 - wall_t1; } diff --git a/sp/cuda/gatherLocations_kernel.cu b/sp/cuda/gatherLocations_kernel.cu index adf6ea9..e66ed62 100644 --- a/sp/cuda/gatherLocations_kernel.cu +++ b/sp/cuda/gatherLocations_kernel.cu @@ -3,19 +3,21 @@ // //user function -__device__ void gatherLocations_gpu( const float *values, float *dest) { - dest[0] = values[0] + values[3]; +__device__ void gatherLocations_gpu( const float *values, const float *zmin, float *dest) { + dest[0] = values[0] + (values[3] - values[0] + *zmin); dest[1] = values[0]; - dest[2] = values[1]; - dest[3] = values[2]; + dest[2] = values[1]/values[0]; + dest[3] = values[2]/values[0]; dest[4] = values[3]; + } // CUDA 
kernel function __global__ void op_cuda_gatherLocations( const float *__restrict ind_arg0, const int *__restrict opDat0Map, - float *arg1, + const float *arg1, + float *arg2, int block_offset, int *blkmap, int *offset, @@ -25,6 +27,7 @@ __global__ void op_cuda_gatherLocations( int nblocks, int set_size) { + __shared__ int nelem, offset_b; extern __shared__ char shared[]; @@ -51,7 +54,8 @@ __global__ void op_cuda_gatherLocations( //user-supplied kernel call gatherLocations_gpu(ind_arg0+map0idx*4, - arg1+(n+offset_b)*5); + arg1, + arg2+(n+offset_b)*5); } } @@ -59,13 +63,16 @@ __global__ void op_cuda_gatherLocations( //host stub function void op_par_loop_gatherLocations(char const *name, op_set set, op_arg arg0, - op_arg arg1){ + op_arg arg1, + op_arg arg2){ - int nargs = 2; - op_arg args[2]; + float*arg1h = (float *)arg1.data; + int nargs = 3; + op_arg args[3]; args[0] = arg0; args[1] = arg1; + args[2] = arg2; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; @@ -76,7 +83,7 @@ void op_par_loop_gatherLocations(char const *name, op_set set, int ninds = 1; - int inds[2] = {0,-1}; + int inds[3] = {0,-1,-1}; if (OP_diags>2) { printf(" kernel routine with indirection: gatherLocations\n"); @@ -89,17 +96,30 @@ void op_par_loop_gatherLocations(char const *name, op_set set, int part_size = OP_part_size; #endif - int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds); + //transfer constants to GPU + int consts_bytes = 0; + consts_bytes += ROUND_UP(1*sizeof(float)); + reallocConstArrays(consts_bytes); + consts_bytes = 0; + arg1.data = OP_consts_h + consts_bytes; + arg1.data_d = OP_consts_d + consts_bytes; + for ( int d=0; d<1; d++ ){ + ((float *)arg1.data)[d] = arg1h[d]; + } + consts_bytes += ROUND_UP(1*sizeof(float)); + mvConstArraysToDevice(consts_bytes); + //execute plan int 
block_offset = 0; for ( int col=0; colncolors; col++ ){ if (col==Plan->ncolors_core) { - op_mpi_wait_all_cuda(nargs, args); + op_mpi_wait_all_grouped(nargs, args, 2); } #ifdef OP_BLOCK_SIZE_20 int nthread = OP_BLOCK_SIZE_20; @@ -114,6 +134,7 @@ void op_par_loop_gatherLocations(char const *name, op_set set, (float *)arg0.data_d, arg0.map_data_d, (float*)arg1.data_d, + (float*)arg2.data_d, block_offset, Plan->blkmap, Plan->offset, @@ -130,7 +151,9 @@ void op_par_loop_gatherLocations(char const *name, op_set set, OP_kernels[20].transfer2 += Plan->transfer2; } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[20].time += wall_t2 - wall_t1; diff --git a/sp/cuda/getMaxElevation_kernel.cu b/sp/cuda/getMaxElevation_kernel.cu index 11f3d8e..4856dc5 100644 --- a/sp/cuda/getMaxElevation_kernel.cu +++ b/sp/cuda/getMaxElevation_kernel.cu @@ -4,13 +4,23 @@ //user function __device__ void getMaxElevation_gpu( const float* values, float* currentMaxElevation) { + float tmp = values[0]+values[3]; + float tmpmax = currentMaxElevation[0]+currentMaxElevation[3]; - if (values[0] > currentMaxElevation[0]) { - currentMaxElevation[0] = values[0]; - currentMaxElevation[1] = values[1]; - currentMaxElevation[2] = values[2]; - currentMaxElevation[3] = values[3]; + if (tmp > tmpmax ) { + + currentMaxElevation[0] = values[0]; + currentMaxElevation[1] = values[1]; + currentMaxElevation[2] = values[2]; + currentMaxElevation[3] = values[3]; } + + + + + + + } // CUDA kernel function @@ -53,15 +63,14 @@ void op_par_loop_getMaxElevation(char const *name, op_set set, printf(" kernel routine w/o indirection: getMaxElevation"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //set CUDA execution parameters #ifdef 
OP_BLOCK_SIZE_18 int nthread = OP_BLOCK_SIZE_18; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -72,7 +81,9 @@ void op_par_loop_getMaxElevation(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[18].time += wall_t2 - wall_t1; diff --git a/sp/cuda/getMaxSpeed_kernel.cu b/sp/cuda/getMaxSpeed_kernel.cu index 2cf7676..adb78e8 100644 --- a/sp/cuda/getMaxSpeed_kernel.cu +++ b/sp/cuda/getMaxSpeed_kernel.cu @@ -5,12 +5,20 @@ //user function __device__ void getMaxSpeed_gpu( const float* values, float* currentMaxSpeed) { - if (sqrt(values[1]*values[1]+values[2]*values[2]) > sqrt(currentMaxSpeed[1]*currentMaxSpeed[1]+currentMaxSpeed[2]*currentMaxSpeed[2])) { - currentMaxSpeed[0] = values[0]; - currentMaxSpeed[1] = values[1]; - currentMaxSpeed[2] = values[2]; - currentMaxSpeed[3] = values[3]; + if (values[0] > 1e-3){ + float TruncatedH = values[0]; + float u = values[1]/TruncatedH; + float v = values[2]/TruncatedH; + float umax = currentMaxSpeed[1]/currentMaxSpeed[0]; + float vmax = currentMaxSpeed[2]/currentMaxSpeed[0]; + if (sqrt(u*u + v*v) > sqrt(umax*umax+vmax*vmax)) { + currentMaxSpeed[0] = values[0]; + currentMaxSpeed[1] = values[1]; + currentMaxSpeed[2] = values[2]; + currentMaxSpeed[3] = values[3]; + } } + } // CUDA kernel function @@ -53,15 +61,14 @@ void op_par_loop_getMaxSpeed(char const *name, op_set set, printf(" kernel routine w/o indirection: getMaxSpeed"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //set CUDA execution parameters #ifdef OP_BLOCK_SIZE_19 int nthread = OP_BLOCK_SIZE_19; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -72,7 +79,9 @@ void 
op_par_loop_getMaxSpeed(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[19].time += wall_t2 - wall_t1; diff --git a/sp/cuda/getTotalVol_kernel.cu b/sp/cuda/getTotalVol_kernel.cu index 596761d..20ca771 100644 --- a/sp/cuda/getTotalVol_kernel.cu +++ b/sp/cuda/getTotalVol_kernel.cu @@ -5,6 +5,7 @@ //user function __device__ void getTotalVol_gpu( const float* cellVolume, const float* value, float* totalVol) { (*totalVol) += (*cellVolume) * value[0]; + } // CUDA kernel function @@ -62,15 +63,14 @@ void op_par_loop_getTotalVol(char const *name, op_set set, printf(" kernel routine w/o indirection: getTotalVol"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //set CUDA execution parameters #ifdef OP_BLOCK_SIZE_17 int nthread = OP_BLOCK_SIZE_17; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -110,7 +110,9 @@ void op_par_loop_getTotalVol(char const *name, op_set set, op_mpi_reduce(&arg2,arg2h); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[17].time += wall_t2 - wall_t1; diff --git a/sp/cuda/incConst_kernel.cu b/sp/cuda/incConst_kernel.cu index 21dfe37..9344b64 100644 --- a/sp/cuda/incConst_kernel.cu +++ b/sp/cuda/incConst_kernel.cu @@ -4,18 +4,23 @@ //user function __device__ void incConst_gpu( const float *in, float *out, const int *variables) { + float H; if (*variables & 1) { out[0] += *in; + out[3] += *in; } if (*variables & 2) { - out[1] += *in; + H = out[0] > EPS_cuda ? 
out[0] : EPS_cuda; + out[1] += *in * H; } if (*variables & 4) { - out[2] += *in; + H = out[0] > EPS_cuda ? out[0] : EPS_cuda; + out[2] += *in * H; } if (*variables & 8) { out[3] += *in; } + } // CUDA kernel function @@ -53,18 +58,18 @@ void op_par_loop_incConst(char const *name, op_set set, // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(5); + op_timing_realloc(4); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[5].name = name; - OP_kernels[5].count += 1; + OP_kernels[4].name = name; + OP_kernels[4].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: incConst"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; @@ -80,11 +85,10 @@ void op_par_loop_incConst(char const *name, op_set set, mvConstArraysToDevice(consts_bytes); //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_5 - int nthread = OP_BLOCK_SIZE_5; + #ifdef OP_BLOCK_SIZE_4 + int nthread = OP_BLOCK_SIZE_4; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -96,10 +100,12 @@ void op_par_loop_incConst(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[5].time += wall_t2 - wall_t1; - OP_kernels[5].transfer += (float)set->size * arg0.size; - OP_kernels[5].transfer += (float)set->size * arg1.size * 2.0f; + OP_kernels[4].time += wall_t2 - wall_t1; + OP_kernels[4].transfer += (float)set->size * arg0.size; + OP_kernels[4].transfer += (float)set->size * arg1.size * 2.0f; } diff --git a/sp/cuda/initBathyRelative_formula_kernel.cu b/sp/cuda/initBathyRelative_formula_kernel.cu index c986f32..96faa0c 100644 --- a/sp/cuda/initBathyRelative_formula_kernel.cu +++ 
b/sp/cuda/initBathyRelative_formula_kernel.cu @@ -7,8 +7,9 @@ __device__ void initBathyRelative_formula_gpu( const float *coords, float *value float x = coords[0]; float y = coords[1]; float t = *time; - float val = exp(-(2.f*sqrt(x*0.01f*0.01f/(tan((5.7f*2.f*M_PI)/360.f)))-sqrt(g)*0.01f*t)*(2.f*sqrt(x*0.01f*0.01f/(tan((5.7f*2.f*M_PI)/360.f)))-sqrt(g)*0.01f*t));; - values[3] = *bathy0 + val; + float val = exp(-(2.f*sqrt(x*0.01f*0.01f/(tan((5.7f*2.f*M_PI)/360.f)))-sqrt(g_cuda)*0.01f*t)*(2.f*sqrt(x*0.01f*0.01f/(tan((5.7f*2.f*M_PI)/360.f)))-sqrt(g_cuda)*0.01f*t));; + *values = *bathy0 + val; + } // CUDA kernel function @@ -25,7 +26,7 @@ __global__ void op_cuda_initBathyRelative_formula( //user-supplied kernel call initBathyRelative_formula_gpu(arg0+n*2, - arg1+n*4, + arg1+n*1, arg2+n*1, arg3); } @@ -50,18 +51,18 @@ void op_par_loop_initBathyRelative_formula(char const *name, op_set set, // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(12); + op_timing_realloc(10); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[12].name = name; - OP_kernels[12].count += 1; + OP_kernels[10].name = name; + OP_kernels[10].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: initBathyRelative_formula"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; @@ -77,11 +78,10 @@ void op_par_loop_initBathyRelative_formula(char const *name, op_set set, mvConstArraysToDevice(consts_bytes); //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_12 - int nthread = OP_BLOCK_SIZE_12; + #ifdef OP_BLOCK_SIZE_10 + int nthread = OP_BLOCK_SIZE_10; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -94,11 +94,13 @@ void op_par_loop_initBathyRelative_formula(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - 
cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[12].time += wall_t2 - wall_t1; - OP_kernels[12].transfer += (float)set->size * arg0.size; - OP_kernels[12].transfer += (float)set->size * arg1.size * 2.0f; - OP_kernels[12].transfer += (float)set->size * arg2.size; + OP_kernels[10].time += wall_t2 - wall_t1; + OP_kernels[10].transfer += (float)set->size * arg0.size; + OP_kernels[10].transfer += (float)set->size * arg1.size * 2.0f; + OP_kernels[10].transfer += (float)set->size * arg2.size; } diff --git a/sp/cuda/initBathymetry_formula_kernel.cu b/sp/cuda/initBathymetry_formula_kernel.cu index ba2de25..b9f6c31 100644 --- a/sp/cuda/initBathymetry_formula_kernel.cu +++ b/sp/cuda/initBathymetry_formula_kernel.cu @@ -7,8 +7,9 @@ __device__ void initBathymetry_formula_gpu( const float *coords, float *values, float x = coords[0]; float y = coords[1]; float t = *time; - float val = .2f*(-5.0f-x)*(x<0.0f)-(x>=0.0f)+.2f*(t<1.0f)*exp(-(x+3.0f-2.0f*t)*(x+3.0f-2.0f*t)-y*y)+.2f*(t>=1.0f)*exp(-(x+1.0f)*(x+1.0f)-y*y);; - values[3] = val; + float val = -1.0f ;; + *values = val; + } // CUDA kernel function @@ -24,7 +25,7 @@ __global__ void op_cuda_initBathymetry_formula( //user-supplied kernel call initBathymetry_formula_gpu(arg0+n*2, - arg1+n*4, + arg1+n*1, arg2); } } @@ -46,18 +47,18 @@ void op_par_loop_initBathymetry_formula(char const *name, op_set set, // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(13); + op_timing_realloc(11); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[13].name = name; - OP_kernels[13].count += 1; + OP_kernels[11].name = name; + OP_kernels[11].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: initBathymetry_formula"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) 
{ //transfer constants to GPU int consts_bytes = 0; @@ -73,11 +74,10 @@ void op_par_loop_initBathymetry_formula(char const *name, op_set set, mvConstArraysToDevice(consts_bytes); //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_13 - int nthread = OP_BLOCK_SIZE_13; + #ifdef OP_BLOCK_SIZE_11 + int nthread = OP_BLOCK_SIZE_11; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -89,10 +89,12 @@ void op_par_loop_initBathymetry_formula(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[13].time += wall_t2 - wall_t1; - OP_kernels[13].transfer += (float)set->size * arg0.size; - OP_kernels[13].transfer += (float)set->size * arg1.size * 2.0f; + OP_kernels[11].time += wall_t2 - wall_t1; + OP_kernels[11].transfer += (float)set->size * arg0.size; + OP_kernels[11].transfer += (float)set->size * arg1.size * 2.0f; } diff --git a/sp/cuda/initBathymetry_large_kernel.cu b/sp/cuda/initBathymetry_large_kernel.cu index 0973112..fcf9925 100644 --- a/sp/cuda/initBathymetry_large_kernel.cu +++ b/sp/cuda/initBathymetry_large_kernel.cu @@ -45,8 +45,9 @@ __device__ void initBathymetry_large_gpu( float *values, const float *cellCenter float b = -(node1[0]-node0[0])*(*bathy2-*bathy0)+(node2[0]-node0[0])*(*bathy1-*bathy0); float c = (node1[0]-node0[0])*(node2[1]-node0[1])-(node2[0]-node0[0])*(node1[1]-node0[1]); - values[3] += *bathy0 - (a*(cellCenter[0]-node0[0]) + b*(cellCenter[1]-node0[1]))/c; + *values += *bathy0 - (a*(cellCenter[0]-node0[0]) + b*(cellCenter[1]-node0[1]))/c; } + } // CUDA kernel function @@ -57,6 +58,10 @@ __global__ void op_cuda_initBathymetry_large( const float *__restrict ind_arg3, const int *__restrict opDat0Map, const int *__restrict opDat2Map, + int *ind_map, + short *arg_map, + int *ind_arg_sizes, + int *ind_arg_offs, 
int block_offset, int *blkmap, int *offset, @@ -65,7 +70,10 @@ __global__ void op_cuda_initBathymetry_large( int *colors, int nblocks, int set_size) { - float arg0_l[4]; + float arg0_l[1]; + + __shared__ int *ind_arg0_map, ind_arg0_size; + __shared__ float *ind_arg0_s; __shared__ int nelems2, ncolor; __shared__ int nelem, offset_b; @@ -87,9 +95,22 @@ __global__ void op_cuda_initBathymetry_large( nelems2 = blockDim.x*(1+(nelem-1)/blockDim.x); ncolor = ncolors[blockId]; + ind_arg0_size = ind_arg_sizes[0+blockId*1]; + + ind_arg0_map = &ind_map[0*set_size] + ind_arg_offs[0+blockId*1]; + + //set shared memory pointers + int nbytes = 0; + ind_arg0_s = (float *) &shared[nbytes]; } __syncthreads(); // make sure all of above completed + for ( int n=threadIdx.x; n=0) { + arg0_map = arg_map[0*set_size+n+offset_b]; + } + for ( int col=0; colsize > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { - op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds); + op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_STAGE_INC); //execute plan int block_offset = 0; for ( int col=0; colncolors; col++ ){ if (col==Plan->ncolors_core) { - op_mpi_wait_all_cuda(nargs, args); + op_mpi_wait_all_grouped(nargs, args, 2); } - #ifdef OP_BLOCK_SIZE_11 - int nthread = OP_BLOCK_SIZE_11; + #ifdef OP_BLOCK_SIZE_9 + int nthread = OP_BLOCK_SIZE_9; #else int nthread = OP_block_size; #endif @@ -204,13 +227,18 @@ void op_par_loop_initBathymetry_large(char const *name, op_set set, dim3 nblocks = dim3(Plan->ncolblk[col] >= (1<<16) ? 65535 : Plan->ncolblk[col], Plan->ncolblk[col] >= (1<<16) ? 
(Plan->ncolblk[col]-1)/65535+1: 1, 1); if (Plan->ncolblk[col] > 0) { - op_cuda_initBathymetry_large<<>>( + int nshared = Plan->nsharedCol[col]; + op_cuda_initBathymetry_large<<>>( (float *)arg0.data_d, (float *)arg1.data_d, (float *)arg2.data_d, (float *)arg5.data_d, arg0.map_data_d, arg2.map_data_d, + Plan->ind_map, + Plan->loc_map, + Plan->ind_sizes, + Plan->ind_offs, block_offset, Plan->blkmap, Plan->offset, @@ -223,12 +251,14 @@ void op_par_loop_initBathymetry_large(char const *name, op_set set, } block_offset += Plan->ncolblk[col]; } - OP_kernels[11].transfer += Plan->transfer; - OP_kernels[11].transfer2 += Plan->transfer2; + OP_kernels[9].transfer += Plan->transfer; + OP_kernels[9].transfer2 += Plan->transfer2; } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[11].time += wall_t2 - wall_t1; + OP_kernels[9].time += wall_t2 - wall_t1; } diff --git a/sp/cuda/initBathymetry_update_kernel.cu b/sp/cuda/initBathymetry_update_kernel.cu index e6fa3b5..48f56e8 100644 --- a/sp/cuda/initBathymetry_update_kernel.cu +++ b/sp/cuda/initBathymetry_update_kernel.cu @@ -3,17 +3,31 @@ // //user function -__device__ void initBathymetry_update_gpu( float *values, const int *firstTime) { - if (*firstTime) - values[0] -= values[3]; +__device__ void initBathymetry_update_gpu( float *values, const float *z_zero, const float *zmin, const int *firstTime) { + if (*firstTime){ + if (*z_zero > 0.0f){ + values[0] = EPS_cuda; + values[3] = -1.0f* *zmin + *z_zero; + } else { + values[0] = -1.0f* *z_zero; + values[3] = -1.0f* *zmin; + } + } else { + if (*z_zero > 0.0f){ + values[3] = -1.0f* *zmin + *z_zero + values[0]; + } else { + values[3] = values[0] + *z_zero - *zmin; + } + } - values[0] = values[0] < EPS ? 
EPS : values[0]; } // CUDA kernel function __global__ void op_cuda_initBathymetry_update( float *arg0, - const int *arg1, + const float *__restrict arg1, + const float *arg2, + const int *arg3, int set_size ) { @@ -22,7 +36,9 @@ __global__ void op_cuda_initBathymetry_update( //user-supplied kernel call initBathymetry_update_gpu(arg0+n*4, - arg1); + arg1+n*1, + arg2, + arg3); } } @@ -30,62 +46,78 @@ __global__ void op_cuda_initBathymetry_update( //host stub function void op_par_loop_initBathymetry_update(char const *name, op_set set, op_arg arg0, - op_arg arg1){ + op_arg arg1, + op_arg arg2, + op_arg arg3){ - int*arg1h = (int *)arg1.data; - int nargs = 2; - op_arg args[2]; + float*arg2h = (float *)arg2.data; + int*arg3h = (int *)arg3.data; + int nargs = 4; + op_arg args[4]; args[0] = arg0; args[1] = arg1; + args[2] = arg2; + args[3] = arg3; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(14); + op_timing_realloc(13); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[14].name = name; - OP_kernels[14].count += 1; + OP_kernels[13].name = name; + OP_kernels[13].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: initBathymetry_update"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; + consts_bytes += ROUND_UP(1*sizeof(float)); consts_bytes += ROUND_UP(1*sizeof(int)); reallocConstArrays(consts_bytes); consts_bytes = 0; - arg1.data = OP_consts_h + consts_bytes; - arg1.data_d = OP_consts_d + consts_bytes; + arg2.data = OP_consts_h + consts_bytes; + arg2.data_d = OP_consts_d + consts_bytes; + for ( int d=0; d<1; d++ ){ + ((float *)arg2.data)[d] = arg2h[d]; + } + consts_bytes += ROUND_UP(1*sizeof(float)); + arg3.data = OP_consts_h + consts_bytes; + arg3.data_d = OP_consts_d + consts_bytes; for ( int d=0; d<1; d++ ){ - ((int *)arg1.data)[d] = arg1h[d]; + ((int 
*)arg3.data)[d] = arg3h[d]; } consts_bytes += ROUND_UP(1*sizeof(int)); mvConstArraysToDevice(consts_bytes); //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_14 - int nthread = OP_BLOCK_SIZE_14; + #ifdef OP_BLOCK_SIZE_13 + int nthread = OP_BLOCK_SIZE_13; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; op_cuda_initBathymetry_update<<>>( (float *) arg0.data_d, - (int *) arg1.data_d, + (float *) arg1.data_d, + (float *) arg2.data_d, + (int *) arg3.data_d, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[14].time += wall_t2 - wall_t1; - OP_kernels[14].transfer += (float)set->size * arg0.size * 2.0f; + OP_kernels[13].time += wall_t2 - wall_t1; + OP_kernels[13].transfer += (float)set->size * arg0.size * 2.0f; + OP_kernels[13].transfer += (float)set->size * arg1.size; } diff --git a/sp/cuda/initBore_select_kernel.cu b/sp/cuda/initBore_select_kernel.cu index c47cb90..6e516a6 100644 --- a/sp/cuda/initBore_select_kernel.cu +++ b/sp/cuda/initBore_select_kernel.cu @@ -14,6 +14,7 @@ __device__ void initBore_select_gpu( float *values, const float *center, values[0] = center[0] < *x0 ? *Hl : *Hr; values[1] = center[0] < *x0 ? *ul : *ur; values[2] = center[0] < *x0 ? 
*vl : *vr; + } // CUDA kernel function @@ -81,18 +82,18 @@ void op_par_loop_initBore_select(char const *name, op_set set, // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(15); + op_timing_realloc(14); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[15].name = name; - OP_kernels[15].count += 1; + OP_kernels[14].name = name; + OP_kernels[14].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: initBore_select"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; @@ -150,11 +151,10 @@ void op_par_loop_initBore_select(char const *name, op_set set, mvConstArraysToDevice(consts_bytes); //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_15 - int nthread = OP_BLOCK_SIZE_15; + #ifdef OP_BLOCK_SIZE_14 + int nthread = OP_BLOCK_SIZE_14; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -172,10 +172,12 @@ void op_par_loop_initBore_select(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[15].time += wall_t2 - wall_t1; - OP_kernels[15].transfer += (float)set->size * arg0.size * 2.0f; - OP_kernels[15].transfer += (float)set->size * arg1.size; + OP_kernels[14].time += wall_t2 - wall_t1; + OP_kernels[14].transfer += (float)set->size * arg0.size * 2.0f; + OP_kernels[14].transfer += (float)set->size * arg1.size; } diff --git a/sp/cuda/initEta_formula_kernel.cu b/sp/cuda/initEta_formula_kernel.cu index 2edbc43..5e3c07e 100644 --- a/sp/cuda/initEta_formula_kernel.cu +++ b/sp/cuda/initEta_formula_kernel.cu @@ -7,9 +7,9 @@ __device__ void initEta_formula_gpu( const float *coords, float *values, const d float x = coords[0]; float 
y = coords[1]; float t = *time; - float val = 0.0f; - + float val = -1.0f ;; values[0] += val; + } // CUDA kernel function @@ -47,18 +47,18 @@ void op_par_loop_initEta_formula(char const *name, op_set set, // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(6); + op_timing_realloc(5); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[6].name = name; - OP_kernels[6].count += 1; + OP_kernels[5].name = name; + OP_kernels[5].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: initEta_formula"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; @@ -74,11 +74,10 @@ void op_par_loop_initEta_formula(char const *name, op_set set, mvConstArraysToDevice(consts_bytes); //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_6 - int nthread = OP_BLOCK_SIZE_6; + #ifdef OP_BLOCK_SIZE_5 + int nthread = OP_BLOCK_SIZE_5; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -90,10 +89,12 @@ void op_par_loop_initEta_formula(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[6].time += wall_t2 - wall_t1; - OP_kernels[6].transfer += (float)set->size * arg0.size; - OP_kernels[6].transfer += (float)set->size * arg1.size * 2.0f; + OP_kernels[5].time += wall_t2 - wall_t1; + OP_kernels[5].transfer += (float)set->size * arg0.size; + OP_kernels[5].transfer += (float)set->size * arg1.size * 2.0f; } diff --git a/sp/cuda/initGaussianLandslide_kernel.cu b/sp/cuda/initGaussianLandslide_kernel.cu index f801422..1401fdc 100644 --- a/sp/cuda/initGaussianLandslide_kernel.cu +++ b/sp/cuda/initGaussianLandslide_kernel.cu @@ -9,6 +9,7 @@ __device__ void 
initGaussianLandslide_gpu( const float *center, float *values, c values[3] = (*mesh_xmin-x)*(x<0.0)-5.0*(x>=0.0)+ *A*(*t<1.0/(*v))*exp(-1.0* *lx* *lx*(x+3.0-*v**t)*(x+3.0-*v**t)-*ly**ly*y*y) +*A*(*t>=1.0/(*v))*exp(-*lx*(x+3.0-1.0)**lx*(x+3.0-1.0)-*ly**ly*y*y); + } // CUDA kernel function @@ -71,18 +72,18 @@ void op_par_loop_initGaussianLandslide(char const *name, op_set set, // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(16); + op_timing_realloc(15); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[16].name = name; - OP_kernels[16].count += 1; + OP_kernels[15].name = name; + OP_kernels[15].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: initGaussianLandslide"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; @@ -133,11 +134,10 @@ void op_par_loop_initGaussianLandslide(char const *name, op_set set, mvConstArraysToDevice(consts_bytes); //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_16 - int nthread = OP_BLOCK_SIZE_16; + #ifdef OP_BLOCK_SIZE_15 + int nthread = OP_BLOCK_SIZE_15; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -154,10 +154,12 @@ void op_par_loop_initGaussianLandslide(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[16].time += wall_t2 - wall_t1; - OP_kernels[16].transfer += (float)set->size * arg0.size; - OP_kernels[16].transfer += (float)set->size * arg1.size * 2.0f; + OP_kernels[15].time += wall_t2 - wall_t1; + OP_kernels[15].transfer += (float)set->size * arg0.size; + OP_kernels[15].transfer += (float)set->size * arg1.size * 2.0f; } diff --git 
a/sp/cuda/initU_formula_kernel.cu b/sp/cuda/initU_formula_kernel.cu index 078818a..dfb9a63 100644 --- a/sp/cuda/initU_formula_kernel.cu +++ b/sp/cuda/initU_formula_kernel.cu @@ -8,8 +8,8 @@ __device__ void initU_formula_gpu( const float *coords, float *values, const dou float y = coords[1]; float t = *time; float val = 0.0f; - values[1] += val; + } // CUDA kernel function @@ -47,18 +47,18 @@ void op_par_loop_initU_formula(char const *name, op_set set, // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(7); + op_timing_realloc(6); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[7].name = name; - OP_kernels[7].count += 1; + OP_kernels[6].name = name; + OP_kernels[6].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: initU_formula"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; @@ -74,11 +74,10 @@ void op_par_loop_initU_formula(char const *name, op_set set, mvConstArraysToDevice(consts_bytes); //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_7 - int nthread = OP_BLOCK_SIZE_7; + #ifdef OP_BLOCK_SIZE_6 + int nthread = OP_BLOCK_SIZE_6; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -90,10 +89,12 @@ void op_par_loop_initU_formula(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[7].time += wall_t2 - wall_t1; - OP_kernels[7].transfer += (float)set->size * arg0.size; - OP_kernels[7].transfer += (float)set->size * arg1.size * 2.0f; + OP_kernels[6].time += wall_t2 - wall_t1; + OP_kernels[6].transfer += (float)set->size * arg0.size; + OP_kernels[6].transfer += (float)set->size * arg1.size * 2.0f; } diff 
--git a/sp/cuda/initV_formula_kernel.cu b/sp/cuda/initV_formula_kernel.cu index 566dfbd..e7dad86 100644 --- a/sp/cuda/initV_formula_kernel.cu +++ b/sp/cuda/initV_formula_kernel.cu @@ -7,8 +7,9 @@ __device__ void initV_formula_gpu( const float *coords, float *values, const dou float x = coords[0]; float y = coords[1]; float t = *time; - float val = 0.0f ;; + float val = 0.0f ;; values[2] += val; + } // CUDA kernel function @@ -46,18 +47,18 @@ void op_par_loop_initV_formula(char const *name, op_set set, // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(8); + op_timing_realloc(7); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[8].name = name; - OP_kernels[8].count += 1; + OP_kernels[7].name = name; + OP_kernels[7].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: initV_formula"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; @@ -73,11 +74,10 @@ void op_par_loop_initV_formula(char const *name, op_set set, mvConstArraysToDevice(consts_bytes); //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_8 - int nthread = OP_BLOCK_SIZE_8; + #ifdef OP_BLOCK_SIZE_7 + int nthread = OP_BLOCK_SIZE_7; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -89,10 +89,12 @@ void op_par_loop_initV_formula(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[8].time += wall_t2 - wall_t1; - OP_kernels[8].transfer += (float)set->size * arg0.size; - OP_kernels[8].transfer += (float)set->size * arg1.size * 2.0f; + OP_kernels[7].time += wall_t2 - wall_t1; + OP_kernels[7].transfer += (float)set->size * arg0.size; + OP_kernels[7].transfer 
+= (float)set->size * arg1.size * 2.0f; } diff --git a/sp/cuda/limiter_kernel.cu b/sp/cuda/limiter_kernel.cu index 94f4e75..93104cc 100644 --- a/sp/cuda/limiter_kernel.cu +++ b/sp/cuda/limiter_kernel.cu @@ -6,12 +6,13 @@ __device__ void limiter_gpu( const float *q, float *lim, const float *value, const float *gradient, const float *edgecenter1, const float *edgecenter2, - const float *edgecenter3, const float *cellcenter) { + const float *edgecenter3, + float *zeroInit, + const float *cellcenter) { float facevalue[3], dx[3], dy[3]; int i, j; float max[3], edgealpha[3]; - dx[0] = (edgecenter1[0] - cellcenter[0]); dy[0] = (edgecenter1[1] - cellcenter[1]); dx[1] = (edgecenter2[0] - cellcenter[0]); @@ -19,7 +20,7 @@ __device__ void limiter_gpu( const float *q, float *lim, dx[2] = (edgecenter3[0] - cellcenter[0]); dy[2] = (edgecenter3[1] - cellcenter[1]); - if((value[0] > EPS) && (q[0]> EPS)){ + if(q[0] > EPS_cuda){ @@ -28,10 +29,10 @@ __device__ void limiter_gpu( const float *q, float *lim, for(j=0;j<4;j++){ for(i =0 ; i<3; i++){ - facevalue[i] = value[j] + ((gradient[2*j]*dx[i]) + (gradient[2*j + 1]*dy[i])); - if(facevalue[i] > value[j]) { + facevalue[i] = value[j] + (((gradient[2*j]*dx[i]) + (gradient[2*j + 1]*dy[i]))); + if(facevalue[i] > q[2*j + 1]) { edgealpha[i] = (q[2*j + 1] - value[j]) / (facevalue[i] - value[j]); - } else if (facevalue[i] < value[j]){ + } else if (facevalue[i] < q[2*j]){ edgealpha[i] = (q[2*j] - value[j]) / (facevalue[i] - value[j]); } else{ edgealpha[i] = 1.0f; @@ -44,13 +45,17 @@ __device__ void limiter_gpu( const float *q, float *lim, lim[0] = lim[0] < lim[1] ? lim[0]: lim[1]; lim[0] = lim[0] < lim[2] ? lim[0]: lim[2]; lim[0] = lim[0] < lim[3] ? 
lim[0]: lim[3]; - } else { lim[0] = 0.0f; lim[1] = 0.0f; lim[2] = 0.0f; lim[3] = 0.0f; } + zeroInit[0] = 0.0f; + zeroInit[1] = 0.0f; + zeroInit[2] = 0.0f; + zeroInit[3] = 0.0f; + } // CUDA kernel function @@ -61,7 +66,8 @@ __global__ void op_cuda_limiter( float *arg1, const float *__restrict arg2, const float *__restrict arg3, - const float *__restrict arg7, + float *arg7, + const float *__restrict arg8, int block_offset, int *blkmap, int *offset, @@ -71,6 +77,7 @@ __global__ void op_cuda_limiter( int nblocks, int set_size) { + __shared__ int nelem, offset_b; extern __shared__ char shared[]; @@ -107,7 +114,8 @@ __global__ void op_cuda_limiter( ind_arg0+map4idx*2, ind_arg0+map5idx*2, ind_arg0+map6idx*2, - arg7+(n+offset_b)*2); + arg7+(n+offset_b)*4, + arg8+(n+offset_b)*2); } } @@ -121,10 +129,11 @@ void op_par_loop_limiter(char const *name, op_set set, op_arg arg4, op_arg arg5, op_arg arg6, - op_arg arg7){ + op_arg arg7, + op_arg arg8){ - int nargs = 8; - op_arg args[8]; + int nargs = 9; + op_arg args[9]; args[0] = arg0; args[1] = arg1; @@ -134,31 +143,32 @@ void op_par_loop_limiter(char const *name, op_set set, args[5] = arg5; args[6] = arg6; args[7] = arg7; + args[8] = arg8; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(22); + op_timing_realloc(23); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[22].name = name; - OP_kernels[22].count += 1; + OP_kernels[23].name = name; + OP_kernels[23].count += 1; int ninds = 1; - int inds[8] = {-1,-1,-1,-1,0,0,0,-1}; + int inds[9] = {-1,-1,-1,-1,0,0,0,-1,-1}; if (OP_diags>2) { printf(" kernel routine with indirection: limiter\n"); } //get plan - #ifdef OP_PART_SIZE_22 - int part_size = OP_PART_SIZE_22; + #ifdef OP_PART_SIZE_23 + int part_size = OP_PART_SIZE_23; #else int part_size = OP_part_size; #endif - int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { op_plan *Plan = 
op_plan_get(name,set,part_size,nargs,args,ninds,inds); @@ -167,10 +177,10 @@ void op_par_loop_limiter(char const *name, op_set set, int block_offset = 0; for ( int col=0; colncolors; col++ ){ if (col==Plan->ncolors_core) { - op_mpi_wait_all_cuda(nargs, args); + op_mpi_wait_all_grouped(nargs, args, 2); } - #ifdef OP_BLOCK_SIZE_22 - int nthread = OP_BLOCK_SIZE_22; + #ifdef OP_BLOCK_SIZE_23 + int nthread = OP_BLOCK_SIZE_23; #else int nthread = OP_block_size; #endif @@ -186,6 +196,7 @@ void op_par_loop_limiter(char const *name, op_set set, (float*)arg2.data_d, (float*)arg3.data_d, (float*)arg7.data_d, + (float*)arg8.data_d, block_offset, Plan->blkmap, Plan->offset, @@ -198,12 +209,14 @@ void op_par_loop_limiter(char const *name, op_set set, } block_offset += Plan->ncolblk[col]; } - OP_kernels[22].transfer += Plan->transfer; - OP_kernels[22].transfer2 += Plan->transfer2; + OP_kernels[23].transfer += Plan->transfer; + OP_kernels[23].transfer2 += Plan->transfer2; } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[22].time += wall_t2 - wall_t1; + OP_kernels[23].time += wall_t2 - wall_t1; } diff --git a/sp/cuda/simulation_1_kernel.cu b/sp/cuda/simulation_1_kernel.cu index c17d11f..c32b693 100644 --- a/sp/cuda/simulation_1_kernel.cu +++ b/sp/cuda/simulation_1_kernel.cu @@ -8,6 +8,7 @@ __device__ void simulation_1_gpu( float *out, const float *in) { out[1] = in[1]; out[2] = in[2]; out[3] = in[3]; + } // CUDA kernel function @@ -40,25 +41,24 @@ void op_par_loop_simulation_1(char const *name, op_set set, // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(4); + op_timing_realloc(3); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[4].name = name; - OP_kernels[4].count += 1; + OP_kernels[3].name = name; + OP_kernels[3].count += 1; if (OP_diags>2) { printf(" kernel routine w/o 
indirection: simulation_1"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_4 - int nthread = OP_BLOCK_SIZE_4; + #ifdef OP_BLOCK_SIZE_3 + int nthread = OP_BLOCK_SIZE_3; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -69,10 +69,12 @@ void op_par_loop_simulation_1(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[4].time += wall_t2 - wall_t1; - OP_kernels[4].transfer += (float)set->size * arg0.size * 2.0f; - OP_kernels[4].transfer += (float)set->size * arg1.size; + OP_kernels[3].time += wall_t2 - wall_t1; + OP_kernels[3].transfer += (float)set->size * arg0.size * 2.0f; + OP_kernels[3].transfer += (float)set->size * arg1.size; } diff --git a/sp/cuda/values_operation2_kernel.cu b/sp/cuda/values_operation2_kernel.cu index 1b0fc07..cc739fb 100644 --- a/sp/cuda/values_operation2_kernel.cu +++ b/sp/cuda/values_operation2_kernel.cu @@ -18,6 +18,7 @@ __device__ void values_operation2_gpu( float *values, const int *result, const i values[*result] = values[*left] / values[*right]; break; } + } // CUDA kernel function @@ -66,18 +67,18 @@ void op_par_loop_values_operation2(char const *name, op_set set, // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(9); + op_timing_realloc(16); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[9].name = name; - OP_kernels[9].count += 1; + OP_kernels[16].name = name; + OP_kernels[16].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: values_operation2"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, 
nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; @@ -114,11 +115,10 @@ void op_par_loop_values_operation2(char const *name, op_set set, mvConstArraysToDevice(consts_bytes); //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_9 - int nthread = OP_BLOCK_SIZE_9; + #ifdef OP_BLOCK_SIZE_16 + int nthread = OP_BLOCK_SIZE_16; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -132,9 +132,11 @@ void op_par_loop_values_operation2(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[9].time += wall_t2 - wall_t1; - OP_kernels[9].transfer += (float)set->size * arg0.size * 2.0f; + OP_kernels[16].time += wall_t2 - wall_t1; + OP_kernels[16].transfer += (float)set->size * arg0.size * 2.0f; } diff --git a/sp/cuda/volna_hybkernels.cu b/sp/cuda/volna_hybkernels.cu index c96ba4e..b9a2436 100644 --- a/sp/cuda/volna_hybkernels.cu +++ b/sp/cuda/volna_hybkernels.cu @@ -4,209 +4,175 @@ //header #ifdef GPUPASS -#define op_par_loop_EvolveValuesRK3_1 op_par_loop_EvolveValuesRK3_1_gpu -#define op_par_loop_EvolveValuesRK3_2 op_par_loop_EvolveValuesRK3_2_gpu -#define op_par_loop_EvolveValuesRK3_3 op_par_loop_EvolveValuesRK3_3_gpu -#define op_par_loop_EvolveValuesRK3_4 op_par_loop_EvolveValuesRK3_4_gpu +#define op_par_loop_EvolveValuesRK2_1 op_par_loop_EvolveValuesRK2_1_gpu +#define op_par_loop_EvolveValuesRK2_2 op_par_loop_EvolveValuesRK2_2_gpu +#define op_par_loop_Friction_manning op_par_loop_Friction_manning_gpu #define op_par_loop_simulation_1 op_par_loop_simulation_1_gpu #define op_par_loop_incConst op_par_loop_incConst_gpu #define op_par_loop_initEta_formula op_par_loop_initEta_formula_gpu #define op_par_loop_initU_formula op_par_loop_initU_formula_gpu #define op_par_loop_initV_formula 
op_par_loop_initV_formula_gpu -#define op_par_loop_values_operation2 op_par_loop_values_operation2_gpu #define op_par_loop_applyConst op_par_loop_applyConst_gpu #define op_par_loop_initBathymetry_large op_par_loop_initBathymetry_large_gpu #define op_par_loop_initBathyRelative_formula op_par_loop_initBathyRelative_formula_gpu #define op_par_loop_initBathymetry_formula op_par_loop_initBathymetry_formula_gpu +#define op_par_loop_zero_bathy op_par_loop_zero_bathy_gpu #define op_par_loop_initBathymetry_update op_par_loop_initBathymetry_update_gpu #define op_par_loop_initBore_select op_par_loop_initBore_select_gpu #define op_par_loop_initGaussianLandslide op_par_loop_initGaussianLandslide_gpu +#define op_par_loop_values_operation2 op_par_loop_values_operation2_gpu #define op_par_loop_getTotalVol op_par_loop_getTotalVol_gpu #define op_par_loop_getMaxElevation op_par_loop_getMaxElevation_gpu #define op_par_loop_getMaxSpeed op_par_loop_getMaxSpeed_gpu #define op_par_loop_gatherLocations op_par_loop_gatherLocations_gpu +#define op_par_loop_toOutputs op_par_loop_toOutputs_gpu #define op_par_loop_computeGradient op_par_loop_computeGradient_gpu #define op_par_loop_limiter op_par_loop_limiter_gpu #define op_par_loop_computeFluxes op_par_loop_computeFluxes_gpu +#define op_par_loop_Timestep op_par_loop_Timestep_gpu #define op_par_loop_NumericalFluxes op_par_loop_NumericalFluxes_gpu -#define op_par_loop_SpaceDiscretization op_par_loop_SpaceDiscretization_gpu +#define op_par_loop_computeFluxes_sph op_par_loop_computeFluxes_sph_gpu +#define op_par_loop_NumericalFluxes_sph op_par_loop_NumericalFluxes_sph_gpu #include "volna_kernels.cu" -#undef op_par_loop_EvolveValuesRK3_1 -#undef op_par_loop_EvolveValuesRK3_2 -#undef op_par_loop_EvolveValuesRK3_3 -#undef op_par_loop_EvolveValuesRK3_4 +#undef op_par_loop_EvolveValuesRK2_1 +#undef op_par_loop_EvolveValuesRK2_2 +#undef op_par_loop_Friction_manning #undef op_par_loop_simulation_1 #undef op_par_loop_incConst #undef 
op_par_loop_initEta_formula #undef op_par_loop_initU_formula #undef op_par_loop_initV_formula -#undef op_par_loop_values_operation2 #undef op_par_loop_applyConst #undef op_par_loop_initBathymetry_large #undef op_par_loop_initBathyRelative_formula #undef op_par_loop_initBathymetry_formula +#undef op_par_loop_zero_bathy #undef op_par_loop_initBathymetry_update #undef op_par_loop_initBore_select #undef op_par_loop_initGaussianLandslide +#undef op_par_loop_values_operation2 #undef op_par_loop_getTotalVol #undef op_par_loop_getMaxElevation #undef op_par_loop_getMaxSpeed #undef op_par_loop_gatherLocations +#undef op_par_loop_toOutputs #undef op_par_loop_computeGradient #undef op_par_loop_limiter #undef op_par_loop_computeFluxes +#undef op_par_loop_Timestep #undef op_par_loop_NumericalFluxes -#undef op_par_loop_SpaceDiscretization +#undef op_par_loop_computeFluxes_sph +#undef op_par_loop_NumericalFluxes_sph #else -#define op_par_loop_EvolveValuesRK3_1 op_par_loop_EvolveValuesRK3_1_cpu -#define op_par_loop_EvolveValuesRK3_2 op_par_loop_EvolveValuesRK3_2_cpu -#define op_par_loop_EvolveValuesRK3_3 op_par_loop_EvolveValuesRK3_3_cpu -#define op_par_loop_EvolveValuesRK3_4 op_par_loop_EvolveValuesRK3_4_cpu +#define op_par_loop_EvolveValuesRK2_1 op_par_loop_EvolveValuesRK2_1_cpu +#define op_par_loop_EvolveValuesRK2_2 op_par_loop_EvolveValuesRK2_2_cpu +#define op_par_loop_Friction_manning op_par_loop_Friction_manning_cpu #define op_par_loop_simulation_1 op_par_loop_simulation_1_cpu #define op_par_loop_incConst op_par_loop_incConst_cpu #define op_par_loop_initEta_formula op_par_loop_initEta_formula_cpu #define op_par_loop_initU_formula op_par_loop_initU_formula_cpu #define op_par_loop_initV_formula op_par_loop_initV_formula_cpu -#define op_par_loop_values_operation2 op_par_loop_values_operation2_cpu #define op_par_loop_applyConst op_par_loop_applyConst_cpu #define op_par_loop_initBathymetry_large op_par_loop_initBathymetry_large_cpu #define op_par_loop_initBathyRelative_formula 
op_par_loop_initBathyRelative_formula_cpu #define op_par_loop_initBathymetry_formula op_par_loop_initBathymetry_formula_cpu +#define op_par_loop_zero_bathy op_par_loop_zero_bathy_cpu #define op_par_loop_initBathymetry_update op_par_loop_initBathymetry_update_cpu #define op_par_loop_initBore_select op_par_loop_initBore_select_cpu #define op_par_loop_initGaussianLandslide op_par_loop_initGaussianLandslide_cpu +#define op_par_loop_values_operation2 op_par_loop_values_operation2_cpu #define op_par_loop_getTotalVol op_par_loop_getTotalVol_cpu #define op_par_loop_getMaxElevation op_par_loop_getMaxElevation_cpu #define op_par_loop_getMaxSpeed op_par_loop_getMaxSpeed_cpu #define op_par_loop_gatherLocations op_par_loop_gatherLocations_cpu +#define op_par_loop_toOutputs op_par_loop_toOutputs_cpu #define op_par_loop_computeGradient op_par_loop_computeGradient_cpu #define op_par_loop_limiter op_par_loop_limiter_cpu #define op_par_loop_computeFluxes op_par_loop_computeFluxes_cpu +#define op_par_loop_Timestep op_par_loop_Timestep_cpu #define op_par_loop_NumericalFluxes op_par_loop_NumericalFluxes_cpu -#define op_par_loop_SpaceDiscretization op_par_loop_SpaceDiscretization_cpu +#define op_par_loop_computeFluxes_sph op_par_loop_computeFluxes_sph_cpu +#define op_par_loop_NumericalFluxes_sph op_par_loop_NumericalFluxes_sph_cpu #include "../openmp/volna_kernels.cpp" -#undef op_par_loop_EvolveValuesRK3_1 -#undef op_par_loop_EvolveValuesRK3_2 -#undef op_par_loop_EvolveValuesRK3_3 -#undef op_par_loop_EvolveValuesRK3_4 +#undef op_par_loop_EvolveValuesRK2_1 +#undef op_par_loop_EvolveValuesRK2_2 +#undef op_par_loop_Friction_manning #undef op_par_loop_simulation_1 #undef op_par_loop_incConst #undef op_par_loop_initEta_formula #undef op_par_loop_initU_formula #undef op_par_loop_initV_formula -#undef op_par_loop_values_operation2 #undef op_par_loop_applyConst #undef op_par_loop_initBathymetry_large #undef op_par_loop_initBathyRelative_formula #undef op_par_loop_initBathymetry_formula +#undef 
op_par_loop_zero_bathy #undef op_par_loop_initBathymetry_update #undef op_par_loop_initBore_select #undef op_par_loop_initGaussianLandslide +#undef op_par_loop_values_operation2 #undef op_par_loop_getTotalVol #undef op_par_loop_getMaxElevation #undef op_par_loop_getMaxSpeed #undef op_par_loop_gatherLocations +#undef op_par_loop_toOutputs #undef op_par_loop_computeGradient #undef op_par_loop_limiter #undef op_par_loop_computeFluxes +#undef op_par_loop_Timestep #undef op_par_loop_NumericalFluxes -#undef op_par_loop_SpaceDiscretization +#undef op_par_loop_computeFluxes_sph +#undef op_par_loop_NumericalFluxes_sph //user kernel files -void op_par_loop_EvolveValuesRK3_1_gpu(char const *name, op_set set, +void op_par_loop_EvolveValuesRK2_1_gpu(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, - op_arg arg3, - op_arg arg4); + op_arg arg3); //GPU host stub function #if OP_HYBRID_GPU -void op_par_loop_EvolveValuesRK3_1(char const *name, op_set set, +void op_par_loop_EvolveValuesRK2_1(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, - op_arg arg3, - op_arg arg4){ + op_arg arg3){ if (OP_hybrid_gpu) { - op_par_loop_EvolveValuesRK3_1_gpu(name, set, + op_par_loop_EvolveValuesRK2_1_gpu(name, set, arg0, arg1, arg2, - arg3, - arg4); + arg3); }else{ - op_par_loop_EvolveValuesRK3_1_cpu(name, set, + op_par_loop_EvolveValuesRK2_1_cpu(name, set, arg0, arg1, arg2, - arg3, - arg4); + arg3); } } #else -void op_par_loop_EvolveValuesRK3_1(char const *name, op_set set, +void op_par_loop_EvolveValuesRK2_1(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, - op_arg arg3, - op_arg arg4){ + op_arg arg3){ - op_par_loop_EvolveValuesRK3_1_gpu(name, set, + op_par_loop_EvolveValuesRK2_1_gpu(name, set, arg0, arg1, arg2, - arg3, - arg4); - - } -#endif //OP_HYBRID_GPU - -void op_par_loop_EvolveValuesRK3_2_gpu(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2); - -//GPU host stub function -#if OP_HYBRID_GPU -void 
op_par_loop_EvolveValuesRK3_2(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2){ - - if (OP_hybrid_gpu) { - op_par_loop_EvolveValuesRK3_2_gpu(name, set, - arg0, - arg1, - arg2); - - }else{ - op_par_loop_EvolveValuesRK3_2_cpu(name, set, - arg0, - arg1, - arg2); - - } -} -#else -void op_par_loop_EvolveValuesRK3_2(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2){ - - op_par_loop_EvolveValuesRK3_2_gpu(name, set, - arg0, - arg1, - arg2); + arg3); } #endif //OP_HYBRID_GPU -void op_par_loop_EvolveValuesRK3_3_gpu(char const *name, op_set set, +void op_par_loop_EvolveValuesRK2_2_gpu(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, @@ -215,7 +181,7 @@ void op_par_loop_EvolveValuesRK3_3_gpu(char const *name, op_set set, //GPU host stub function #if OP_HYBRID_GPU -void op_par_loop_EvolveValuesRK3_3(char const *name, op_set set, +void op_par_loop_EvolveValuesRK2_2(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, @@ -223,7 +189,7 @@ void op_par_loop_EvolveValuesRK3_3(char const *name, op_set set, op_arg arg4){ if (OP_hybrid_gpu) { - op_par_loop_EvolveValuesRK3_3_gpu(name, set, + op_par_loop_EvolveValuesRK2_2_gpu(name, set, arg0, arg1, arg2, @@ -231,7 +197,7 @@ void op_par_loop_EvolveValuesRK3_3(char const *name, op_set set, arg4); }else{ - op_par_loop_EvolveValuesRK3_3_cpu(name, set, + op_par_loop_EvolveValuesRK2_2_cpu(name, set, arg0, arg1, arg2, @@ -241,14 +207,14 @@ void op_par_loop_EvolveValuesRK3_3(char const *name, op_set set, } } #else -void op_par_loop_EvolveValuesRK3_3(char const *name, op_set set, +void op_par_loop_EvolveValuesRK2_2(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4){ - op_par_loop_EvolveValuesRK3_3_gpu(name, set, + op_par_loop_EvolveValuesRK2_2_gpu(name, set, arg0, arg1, arg2, @@ -258,48 +224,42 @@ void op_par_loop_EvolveValuesRK3_3(char const *name, op_set set, } #endif //OP_HYBRID_GPU -void 
op_par_loop_EvolveValuesRK3_4_gpu(char const *name, op_set set, +void op_par_loop_Friction_manning_gpu(char const *name, op_set set, op_arg arg0, op_arg arg1, - op_arg arg2, - op_arg arg3); + op_arg arg2); //GPU host stub function #if OP_HYBRID_GPU -void op_par_loop_EvolveValuesRK3_4(char const *name, op_set set, +void op_par_loop_Friction_manning(char const *name, op_set set, op_arg arg0, op_arg arg1, - op_arg arg2, - op_arg arg3){ + op_arg arg2){ if (OP_hybrid_gpu) { - op_par_loop_EvolveValuesRK3_4_gpu(name, set, + op_par_loop_Friction_manning_gpu(name, set, arg0, arg1, - arg2, - arg3); + arg2); }else{ - op_par_loop_EvolveValuesRK3_4_cpu(name, set, + op_par_loop_Friction_manning_cpu(name, set, arg0, arg1, - arg2, - arg3); + arg2); } } #else -void op_par_loop_EvolveValuesRK3_4(char const *name, op_set set, +void op_par_loop_Friction_manning(char const *name, op_set set, op_arg arg0, op_arg arg1, - op_arg arg2, - op_arg arg3){ + op_arg arg2){ - op_par_loop_EvolveValuesRK3_4_gpu(name, set, + op_par_loop_Friction_manning_gpu(name, set, arg0, arg1, - arg2, - arg3); + arg2); } #endif //OP_HYBRID_GPU @@ -498,58 +458,6 @@ void op_par_loop_initV_formula(char const *name, op_set set, } #endif //OP_HYBRID_GPU -void op_par_loop_values_operation2_gpu(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2, - op_arg arg3, - op_arg arg4); - -//GPU host stub function -#if OP_HYBRID_GPU -void op_par_loop_values_operation2(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2, - op_arg arg3, - op_arg arg4){ - - if (OP_hybrid_gpu) { - op_par_loop_values_operation2_gpu(name, set, - arg0, - arg1, - arg2, - arg3, - arg4); - - }else{ - op_par_loop_values_operation2_cpu(name, set, - arg0, - arg1, - arg2, - arg3, - arg4); - - } -} -#else -void op_par_loop_values_operation2(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2, - op_arg arg3, - op_arg arg4){ - - op_par_loop_values_operation2_gpu(name, set, - arg0, - arg1, - arg2, - 
arg3, - arg4); - - } -#endif //OP_HYBRID_GPU - void op_par_loop_applyConst_gpu(char const *name, op_set set, op_arg arg0, op_arg arg1, @@ -746,40 +654,86 @@ void op_par_loop_initBathymetry_formula(char const *name, op_set set, } #endif //OP_HYBRID_GPU -void op_par_loop_initBathymetry_update_gpu(char const *name, op_set set, +void op_par_loop_zero_bathy_gpu(char const *name, op_set set, op_arg arg0, op_arg arg1); //GPU host stub function #if OP_HYBRID_GPU -void op_par_loop_initBathymetry_update(char const *name, op_set set, +void op_par_loop_zero_bathy(char const *name, op_set set, op_arg arg0, op_arg arg1){ if (OP_hybrid_gpu) { - op_par_loop_initBathymetry_update_gpu(name, set, + op_par_loop_zero_bathy_gpu(name, set, arg0, arg1); }else{ - op_par_loop_initBathymetry_update_cpu(name, set, + op_par_loop_zero_bathy_cpu(name, set, arg0, arg1); } } #else -void op_par_loop_initBathymetry_update(char const *name, op_set set, +void op_par_loop_zero_bathy(char const *name, op_set set, op_arg arg0, op_arg arg1){ - op_par_loop_initBathymetry_update_gpu(name, set, + op_par_loop_zero_bathy_gpu(name, set, arg0, arg1); } #endif //OP_HYBRID_GPU +void op_par_loop_initBathymetry_update_gpu(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3); + +//GPU host stub function +#if OP_HYBRID_GPU +void op_par_loop_initBathymetry_update(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3){ + + if (OP_hybrid_gpu) { + op_par_loop_initBathymetry_update_gpu(name, set, + arg0, + arg1, + arg2, + arg3); + + }else{ + op_par_loop_initBathymetry_update_cpu(name, set, + arg0, + arg1, + arg2, + arg3); + + } +} +#else +void op_par_loop_initBathymetry_update(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3){ + + op_par_loop_initBathymetry_update_gpu(name, set, + arg0, + arg1, + arg2, + arg3); + + } +#endif //OP_HYBRID_GPU + void op_par_loop_initBore_select_gpu(char const *name, op_set set, 
op_arg arg0, op_arg arg1, @@ -926,6 +880,58 @@ void op_par_loop_initGaussianLandslide(char const *name, op_set set, } #endif //OP_HYBRID_GPU +void op_par_loop_values_operation2_gpu(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3, + op_arg arg4); + +//GPU host stub function +#if OP_HYBRID_GPU +void op_par_loop_values_operation2(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3, + op_arg arg4){ + + if (OP_hybrid_gpu) { + op_par_loop_values_operation2_gpu(name, set, + arg0, + arg1, + arg2, + arg3, + arg4); + + }else{ + op_par_loop_values_operation2_cpu(name, set, + arg0, + arg1, + arg2, + arg3, + arg4); + + } +} +#else +void op_par_loop_values_operation2(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3, + op_arg arg4){ + + op_par_loop_values_operation2_gpu(name, set, + arg0, + arg1, + arg2, + arg3, + arg4); + + } +#endif //OP_HYBRID_GPU + void op_par_loop_getTotalVol_gpu(char const *name, op_set set, op_arg arg0, op_arg arg1, @@ -1036,34 +1042,80 @@ void op_par_loop_getMaxSpeed(char const *name, op_set set, void op_par_loop_gatherLocations_gpu(char const *name, op_set set, op_arg arg0, - op_arg arg1); + op_arg arg1, + op_arg arg2); //GPU host stub function #if OP_HYBRID_GPU void op_par_loop_gatherLocations(char const *name, op_set set, op_arg arg0, - op_arg arg1){ + op_arg arg1, + op_arg arg2){ if (OP_hybrid_gpu) { op_par_loop_gatherLocations_gpu(name, set, arg0, - arg1); + arg1, + arg2); }else{ op_par_loop_gatherLocations_cpu(name, set, arg0, - arg1); + arg1, + arg2); } } #else void op_par_loop_gatherLocations(char const *name, op_set set, op_arg arg0, - op_arg arg1){ + op_arg arg1, + op_arg arg2){ op_par_loop_gatherLocations_gpu(name, set, arg0, - arg1); + arg1, + arg2); + + } +#endif //OP_HYBRID_GPU + +void op_par_loop_toOutputs_gpu(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2); + +//GPU host stub function +#if 
OP_HYBRID_GPU +void op_par_loop_toOutputs(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2){ + + if (OP_hybrid_gpu) { + op_par_loop_toOutputs_gpu(name, set, + arg0, + arg1, + arg2); + + }else{ + op_par_loop_toOutputs_cpu(name, set, + arg0, + arg1, + arg2); + + } +} +#else +void op_par_loop_toOutputs(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2){ + + op_par_loop_toOutputs_gpu(name, set, + arg0, + arg1, + arg2); } #endif //OP_HYBRID_GPU @@ -1158,7 +1210,8 @@ void op_par_loop_limiter_gpu(char const *name, op_set set, op_arg arg4, op_arg arg5, op_arg arg6, - op_arg arg7); + op_arg arg7, + op_arg arg8); //GPU host stub function #if OP_HYBRID_GPU @@ -1170,7 +1223,8 @@ void op_par_loop_limiter(char const *name, op_set set, op_arg arg4, op_arg arg5, op_arg arg6, - op_arg arg7){ + op_arg arg7, + op_arg arg8){ if (OP_hybrid_gpu) { op_par_loop_limiter_gpu(name, set, @@ -1181,7 +1235,8 @@ void op_par_loop_limiter(char const *name, op_set set, arg4, arg5, arg6, - arg7); + arg7, + arg8); }else{ op_par_loop_limiter_cpu(name, set, @@ -1192,7 +1247,8 @@ void op_par_loop_limiter(char const *name, op_set set, arg4, arg5, arg6, - arg7); + arg7, + arg8); } } @@ -1205,7 +1261,8 @@ void op_par_loop_limiter(char const *name, op_set set, op_arg arg4, op_arg arg5, op_arg arg6, - op_arg arg7){ + op_arg arg7, + op_arg arg8){ op_par_loop_limiter_gpu(name, set, arg0, @@ -1215,7 +1272,8 @@ void op_par_loop_limiter(char const *name, op_set set, arg4, arg5, arg6, - arg7); + arg7, + arg8); } #endif //OP_HYBRID_GPU @@ -1235,7 +1293,8 @@ void op_par_loop_computeFluxes_gpu(char const *name, op_set set, op_arg arg11, op_arg arg12, op_arg arg13, - op_arg arg14); + op_arg arg14, + op_arg arg15); //GPU host stub function #if OP_HYBRID_GPU @@ -1254,7 +1313,8 @@ void op_par_loop_computeFluxes(char const *name, op_set set, op_arg arg11, op_arg arg12, op_arg arg13, - op_arg arg14){ + op_arg arg14, + op_arg arg15){ if (OP_hybrid_gpu) { 
op_par_loop_computeFluxes_gpu(name, set, @@ -1272,7 +1332,8 @@ void op_par_loop_computeFluxes(char const *name, op_set set, arg11, arg12, arg13, - arg14); + arg14, + arg15); }else{ op_par_loop_computeFluxes_cpu(name, set, @@ -1290,7 +1351,8 @@ void op_par_loop_computeFluxes(char const *name, op_set set, arg11, arg12, arg13, - arg14); + arg14, + arg15); } } @@ -1310,7 +1372,8 @@ void op_par_loop_computeFluxes(char const *name, op_set set, op_arg arg11, op_arg arg12, op_arg arg13, - op_arg arg14){ + op_arg arg14, + op_arg arg15){ op_par_loop_computeFluxes_gpu(name, set, arg0, @@ -1327,7 +1390,78 @@ void op_par_loop_computeFluxes(char const *name, op_set set, arg11, arg12, arg13, - arg14); + arg14, + arg15); + + } +#endif //OP_HYBRID_GPU + +void op_par_loop_Timestep_gpu(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3, + op_arg arg4, + op_arg arg5, + op_arg arg6, + op_arg arg7); + +//GPU host stub function +#if OP_HYBRID_GPU +void op_par_loop_Timestep(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3, + op_arg arg4, + op_arg arg5, + op_arg arg6, + op_arg arg7){ + + if (OP_hybrid_gpu) { + op_par_loop_Timestep_gpu(name, set, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7); + + }else{ + op_par_loop_Timestep_cpu(name, set, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7); + + } +} +#else +void op_par_loop_Timestep(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3, + op_arg arg4, + op_arg arg5, + op_arg arg6, + op_arg arg7){ + + op_par_loop_Timestep_gpu(name, set, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7); } #endif //OP_HYBRID_GPU @@ -1340,8 +1474,7 @@ void op_par_loop_NumericalFluxes_gpu(char const *name, op_set set, op_arg arg4, op_arg arg5, op_arg arg6, - op_arg arg7, - op_arg arg8); + op_arg arg7); //GPU host stub function #if OP_HYBRID_GPU @@ -1353,8 +1486,7 @@ void op_par_loop_NumericalFluxes(char 
const *name, op_set set, op_arg arg4, op_arg arg5, op_arg arg6, - op_arg arg7, - op_arg arg8){ + op_arg arg7){ if (OP_hybrid_gpu) { op_par_loop_NumericalFluxes_gpu(name, set, @@ -1365,8 +1497,7 @@ void op_par_loop_NumericalFluxes(char const *name, op_set set, arg4, arg5, arg6, - arg7, - arg8); + arg7); }else{ op_par_loop_NumericalFluxes_cpu(name, set, @@ -1377,8 +1508,7 @@ void op_par_loop_NumericalFluxes(char const *name, op_set set, arg4, arg5, arg6, - arg7, - arg8); + arg7); } } @@ -1391,10 +1521,119 @@ void op_par_loop_NumericalFluxes(char const *name, op_set set, op_arg arg4, op_arg arg5, op_arg arg6, - op_arg arg7, - op_arg arg8){ + op_arg arg7){ op_par_loop_NumericalFluxes_gpu(name, set, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7); + + } +#endif //OP_HYBRID_GPU + +void op_par_loop_computeFluxes_sph_gpu(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3, + op_arg arg4, + op_arg arg5, + op_arg arg6, + op_arg arg7, + op_arg arg8, + op_arg arg9, + op_arg arg10, + op_arg arg11, + op_arg arg12, + op_arg arg13, + op_arg arg14, + op_arg arg15); + +//GPU host stub function +#if OP_HYBRID_GPU +void op_par_loop_computeFluxes_sph(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3, + op_arg arg4, + op_arg arg5, + op_arg arg6, + op_arg arg7, + op_arg arg8, + op_arg arg9, + op_arg arg10, + op_arg arg11, + op_arg arg12, + op_arg arg13, + op_arg arg14, + op_arg arg15){ + + if (OP_hybrid_gpu) { + op_par_loop_computeFluxes_sph_gpu(name, set, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15); + + }else{ + op_par_loop_computeFluxes_sph_cpu(name, set, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15); + + } +} +#else +void op_par_loop_computeFluxes_sph(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg 
arg2, + op_arg arg3, + op_arg arg4, + op_arg arg5, + op_arg arg6, + op_arg arg7, + op_arg arg8, + op_arg arg9, + op_arg arg10, + op_arg arg11, + op_arg arg12, + op_arg arg13, + op_arg arg14, + op_arg arg15){ + + op_par_loop_computeFluxes_sph_gpu(name, set, arg0, arg1, arg2, @@ -1403,12 +1642,19 @@ void op_par_loop_NumericalFluxes(char const *name, op_set set, arg5, arg6, arg7, - arg8); + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15); } #endif //OP_HYBRID_GPU -void op_par_loop_SpaceDiscretization_gpu(char const *name, op_set set, +void op_par_loop_NumericalFluxes_sph_gpu(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, @@ -1422,7 +1668,7 @@ void op_par_loop_SpaceDiscretization_gpu(char const *name, op_set set, //GPU host stub function #if OP_HYBRID_GPU -void op_par_loop_SpaceDiscretization(char const *name, op_set set, +void op_par_loop_NumericalFluxes_sph(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, @@ -1435,7 +1681,7 @@ void op_par_loop_SpaceDiscretization(char const *name, op_set set, op_arg arg9){ if (OP_hybrid_gpu) { - op_par_loop_SpaceDiscretization_gpu(name, set, + op_par_loop_NumericalFluxes_sph_gpu(name, set, arg0, arg1, arg2, @@ -1448,7 +1694,7 @@ void op_par_loop_SpaceDiscretization(char const *name, op_set set, arg9); }else{ - op_par_loop_SpaceDiscretization_cpu(name, set, + op_par_loop_NumericalFluxes_sph_cpu(name, set, arg0, arg1, arg2, @@ -1463,7 +1709,7 @@ void op_par_loop_SpaceDiscretization(char const *name, op_set set, } } #else -void op_par_loop_SpaceDiscretization(char const *name, op_set set, +void op_par_loop_NumericalFluxes_sph(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, @@ -1475,7 +1721,7 @@ void op_par_loop_SpaceDiscretization(char const *name, op_set set, op_arg arg8, op_arg arg9){ - op_par_loop_SpaceDiscretization_gpu(name, set, + op_par_loop_NumericalFluxes_sph_gpu(name, set, arg0, arg1, arg2, diff --git a/sp/cuda/volna_kernels.cu 
b/sp/cuda/volna_kernels.cu index 5a70bac..7a05762 100644 --- a/sp/cuda/volna_kernels.cu +++ b/sp/cuda/volna_kernels.cu @@ -7,9 +7,12 @@ #define MAX_CONST_SIZE 128 #endif -__constant__ float CFL; -__constant__ float EPS; -__constant__ float g; +__constant__ float CFL_cuda; +__constant__ float EPS_cuda; +__constant__ float g_cuda; +__constant__ float Mn_cuda; +__constant__ int spherical_cuda; +__constant__ float Radius_cuda; //header #include "op_lib_cpp.h" @@ -20,15 +23,27 @@ void op_decl_const_char(int dim, char const *type, int size, char *dat, char const *name){ if (!OP_hybrid_gpu) return; if (!strcmp(name,"CFL")) { - cutilSafeCall(cudaMemcpyToSymbol(CFL, dat, dim*size)); + cutilSafeCall(cudaMemcpyToSymbol(CFL_cuda, dat, dim*size)); } else if (!strcmp(name,"EPS")) { - cutilSafeCall(cudaMemcpyToSymbol(EPS, dat, dim*size)); + cutilSafeCall(cudaMemcpyToSymbol(EPS_cuda, dat, dim*size)); } else if (!strcmp(name,"g")) { - cutilSafeCall(cudaMemcpyToSymbol(g, dat, dim*size)); + cutilSafeCall(cudaMemcpyToSymbol(g_cuda, dat, dim*size)); + } + else + if (!strcmp(name,"Mn")) { + cutilSafeCall(cudaMemcpyToSymbol(Mn_cuda, dat, dim*size)); + } + else + if (!strcmp(name,"spherical")) { + cutilSafeCall(cudaMemcpyToSymbol(spherical_cuda, dat, dim*size)); + } + else + if (!strcmp(name,"Radius")) { + cutilSafeCall(cudaMemcpyToSymbol(Radius_cuda, dat, dim*size)); } else { @@ -37,29 +52,32 @@ int size, char *dat, char const *name){ } //user kernel files -#include "EvolveValuesRK3_1_kernel.cu" -#include "EvolveValuesRK3_2_kernel.cu" -#include "EvolveValuesRK3_3_kernel.cu" -#include "EvolveValuesRK3_4_kernel.cu" +#include "EvolveValuesRK2_1_kernel.cu" +#include "EvolveValuesRK2_2_kernel.cu" +#include "Friction_manning_kernel.cu" #include "simulation_1_kernel.cu" #include "incConst_kernel.cu" #include "initEta_formula_kernel.cu" #include "initU_formula_kernel.cu" #include "initV_formula_kernel.cu" -#include "values_operation2_kernel.cu" #include "applyConst_kernel.cu" #include 
"initBathymetry_large_kernel.cu" #include "initBathyRelative_formula_kernel.cu" #include "initBathymetry_formula_kernel.cu" +#include "zero_bathy_kernel.cu" #include "initBathymetry_update_kernel.cu" #include "initBore_select_kernel.cu" #include "initGaussianLandslide_kernel.cu" +#include "values_operation2_kernel.cu" #include "getTotalVol_kernel.cu" #include "getMaxElevation_kernel.cu" #include "getMaxSpeed_kernel.cu" #include "gatherLocations_kernel.cu" +#include "toOutputs_kernel.cu" #include "computeGradient_kernel.cu" #include "limiter_kernel.cu" #include "computeFluxes_kernel.cu" +#include "Timestep_kernel.cu" #include "NumericalFluxes_kernel.cu" -#include "SpaceDiscretization_kernel.cu" +#include "computeFluxes_sph_kernel.cu" +#include "NumericalFluxes_sph_kernel.cu" diff --git a/sp/volna_op.cpp b/sp/volna_op.cpp index fa4b32e..ac781f9 100644 --- a/sp/volna_op.cpp +++ b/sp/volna_op.cpp @@ -69,9 +69,9 @@ double timestamp = 0.0; int itercount = 0; // Constants -float CFL, g, EPS; +float CFL, g, EPS, Mn, Radius; +int spherical; bool new_format = true; - // Store maximum elevation and speed in global variable, for the sake of max search op_dat currentMaxElevation = NULL; op_dat currentMaxSpeed = NULL; @@ -241,7 +241,8 @@ int main(int argc, char **argv) { * Read constants from HDF5 */ op_get_const_hdf5("CFL", 1, "float", (char *) &CFL, filename_h5); - // op_get_const_hdf5("Mn", 1, "float", (char *) &CFL, filename_h5); + op_get_const_hdf5("Mn", 1, "float", (char *) &Mn, filename_h5); + op_get_const_hdf5("Spherical", 1, "int", (char *) &spherical, filename_h5); // Final time: as defined by Volna the end of real-time simulation float ftime_tmp, dtmax_tmp; op_get_const_hdf5("ftime", 1, "float", (char *) &ftime_tmp, filename_h5); @@ -253,7 +254,19 @@ int main(int argc, char **argv) { op_decl_const2("CFL",1,"float",&CFL); op_decl_const2("EPS",1,"float",&EPS); op_decl_const2("g",1,"float",&g); - + op_decl_const2("Mn",1,"float",&Mn); + 
op_decl_const2("spherical",1,"int",&spherical); + + Radius = 6378136.6; + op_decl_const2("Radius",1,"float",&Radius); + + printf("Mn = %f, CFL = %f \n", Mn, CFL); + if (spherical){ + printf("Spherical Polar coordinates \n"); + printf("Radius of the Earth = %f \n", Radius); + } else { + printf("Cartesian coordinates \n"); + } //Read InitBathymetry and InitEta event data when they come from files for (unsigned int i = 0; i < events.size(); i++) { if (!strcmp(events[i].className.c_str(), "InitEta")) { @@ -342,8 +355,8 @@ int main(int argc, char **argv) { // op_partition("PARMETIS", "GEOM", NULL, NULL, cellCenters); // op_partition("PTSCOTCH", "GEOM", NULL, NULL, cellCenters); // op_partition("", "", NULL, NULL, NULL); - op_partition("PARMETIS", "KWAY", NULL, edgesToCells, NULL); -// op_partition("PTSCOTCH", "KWAY", NULL, edgesToCells, NULL); + // op_partition("PARMETIS", "KWAY", NULL, edgesToCells, NULL); + op_partition("PTSCOTCH", "KWAY", NULL, edgesToCells, NULL); // op_partition("PARMETIS", "GEOMKWAY", edges, edgesToCells, cellCenters); // op_partition("PARMETIS", "KWAY", NULL, NULL, NULL); // op_partition("PARMETIS", "KWAY", edges, edgesToCells, cellCenters); @@ -385,16 +398,19 @@ int main(int argc, char **argv) { while (timestamp < ftime) { //process post_update==false events (usually Init events) processEvents(&timers, &events, 0, 0, 0.0, 0, 0, cells, values, cellVolumes, cellCenters, nodeCoords, cellsToNodes, temp_initEta, temp_initU, temp_initV, bathy_nodes, lifted_cells, liftedcellsToBathyNodes, liftedcellsToCells, bathy_xy, initial_zb, temp_initBathymetry, z_zero, n_initBathymetry, &zmin, outputLocation_map, outputLocation_dat, writeOption); - - -#ifdef DEBUG -#endif { - float minTimestep = 0.0; + float minTimestep = INFINITY; + if (!spherical){ spaceDiscretization(values, Lw_n, &minTimestep, bathySource, edgeFluxes, maxEdgeEigenvalues, edgeNormals, edgeLength, cellVolumes, isBoundary, cells, edges, edgesToCells, cellsToEdges, cellsToCells, edgeCenters, 
cellCenters, GradientatCell, q, lim, &zmin); + } else { + spaceDiscretization_sph(values, Lw_n, &minTimestep, + bathySource, edgeFluxes, maxEdgeEigenvalues, + edgeNormals, edgeLength, cellVolumes, isBoundary, + cells, edges, edgesToCells, cellsToEdges, cellsToCells, edgeCenters, cellCenters, GradientatCell, q, lim, &zmin); + } #ifdef DEBUG printf("Return of SpaceDiscretization #1 midPointConservative H %g U %g V %g Zb %g \n", normcomp(Lw_n, 0), normcomp(Lw_n, 1),normcomp(Lw_n, 2),normcomp(Lw_n, 3)); #endif @@ -411,12 +427,19 @@ int main(int argc, char **argv) { #endif float dummy = -1.0f; + if (!spherical){ spaceDiscretization(w_1, Lw_1, &dummy, bathySource, edgeFluxes, maxEdgeEigenvalues, edgeNormals, edgeLength, cellVolumes, isBoundary, cells, edges, edgesToCells, cellsToEdges, cellsToCells, edgeCenters, cellCenters, GradientatCell, q, lim, &zmin); - + } else { + spaceDiscretization_sph(w_1, Lw_1, &dummy, + bathySource, edgeFluxes, maxEdgeEigenvalues, + edgeNormals, edgeLength, cellVolumes, isBoundary, + cells, edges, edgesToCells, cellsToEdges, + cellsToCells, edgeCenters, cellCenters, GradientatCell, q, lim, &zmin); + } op_par_loop_EvolveValuesRK2_2("EvolveValuesRK2_2",cells, op_arg_gbl(&dT,1,"float",OP_READ), @@ -427,16 +450,15 @@ int main(int argc, char **argv) { timestep=dT; - float Mn = 0.025f; op_par_loop_Friction_manning("Friction_manning",cells, op_arg_gbl(&dT,1,"float",OP_READ), op_arg_gbl(&Mn,1,"float",OP_READ), op_arg_dat(values_new,-1,OP_ID,4,"float",OP_RW)); - } + op_par_loop_simulation_1("simulation_1",cells, op_arg_dat(values,-1,OP_ID,4,"float",OP_WRITE), op_arg_dat(values_new,-1,OP_ID,4,"float",OP_READ)); -// printf("Return of SpaceDiscretization #1 midPointConservative H %g U %g V %g Zb %g \n", normcomp(values_new, 0), normcomp(values_new, 1),normcomp(values_new, 2),normcomp(values_new, 3)); + } itercount++; timestamp += timestep; diff --git a/sp/volna_simulation_op.cpp b/sp/volna_simulation_op.cpp index 303827f..a4925c5 100644 --- 
a/sp/volna_simulation_op.cpp +++ b/sp/volna_simulation_op.cpp @@ -84,6 +84,36 @@ void op_par_loop_NumericalFluxes(char const *, op_set, op_arg, op_arg, op_arg ); + +void op_par_loop_computeFluxes_sph(char const *, op_set, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg ); + +void op_par_loop_NumericalFluxes_sph(char const *, op_set, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg ); #ifdef OPENACC #ifdef __cplusplus } @@ -97,7 +127,8 @@ void op_par_loop_NumericalFluxes(char const *, op_set, #include "computeFluxes.h" #include "Timestep.h" #include "NumericalFluxes.h" - +#include "computeFluxes_sph.h" +#include "NumericalFluxes_sph.h" void spaceDiscretization(op_dat data_in, op_dat data_out, float *minTimestep, @@ -154,7 +185,6 @@ void spaceDiscretization(op_dat data_in, op_dat data_out, float *minTimestep, } if (*minTimestep >= 0.0){ - *minTimestep = INFINITY; op_par_loop_Timestep("Timestep",cells, op_arg_dat(maxEdgeEigenvalues,0,cellsToEdges,1,"float",OP_READ), op_arg_dat(maxEdgeEigenvalues,1,cellsToEdges,1,"float",OP_READ), @@ -177,3 +207,82 @@ void spaceDiscretization(op_dat data_in, op_dat data_out, float *minTimestep, op_arg_dat(cellVolumes,1,edgesToCells,1,"float",OP_READ)); } } + +void spaceDiscretization_sph(op_dat data_in, op_dat data_out, float *minTimestep, + op_dat bathySource, op_dat edgeFluxes, op_dat maxEdgeEigenvalues, + op_dat edgeNormals, op_dat edgeLength, op_dat cellVolumes, op_dat isBoundary, + op_set cells, op_set edges, op_map edgesToCells, op_map cellsToEdges, + op_map cellsToCells, op_dat edgeCenters, op_dat cellCenters, op_dat GradientatCell, op_dat q, op_dat lim, float *zmin) { + { + + { + // TO DO: Pre calculate the geometric mesh quantities + op_par_loop_computeGradient("computeGradient",cells, + op_arg_dat(data_in,-1,OP_ID,4,"float",OP_READ), + 
op_arg_dat(data_in,0,cellsToCells,4,"float",OP_READ), + op_arg_dat(data_in,1,cellsToCells,4,"float",OP_READ), + op_arg_dat(data_in,2,cellsToCells,4,"float",OP_READ), + op_arg_dat(cellCenters,-1,OP_ID,2,"float",OP_READ), + op_arg_dat(cellCenters,0,cellsToCells,2,"float",OP_READ), + op_arg_dat(cellCenters,1,cellsToCells,2,"float",OP_READ), + op_arg_dat(cellCenters,2,cellsToCells,2,"float",OP_RW), + op_arg_dat(q,-1,OP_ID,8,"float",OP_WRITE), + op_arg_dat(GradientatCell,-1,OP_ID,8,"float",OP_WRITE)); + } + op_par_loop_limiter("limiter",cells, + op_arg_dat(q,-1,OP_ID,8,"float",OP_READ), + op_arg_dat(lim,-1,OP_ID,4,"float",OP_WRITE), + op_arg_dat(data_in,-1,OP_ID,4,"float",OP_READ), + op_arg_dat(GradientatCell,-1,OP_ID,8,"float",OP_READ), + op_arg_dat(edgeCenters,0,cellsToEdges,2,"float",OP_READ), + op_arg_dat(edgeCenters,1,cellsToEdges,2,"float",OP_READ), + op_arg_dat(edgeCenters,2,cellsToEdges,2,"float",OP_READ), + op_arg_dat(data_out,-1,OP_ID,4,"float",OP_WRITE), + op_arg_dat(cellCenters,-1,OP_ID,2,"float",OP_READ)); + + { + op_par_loop_computeFluxes_sph("computeFluxes_sph",edges, + op_arg_dat(data_in,0,edgesToCells,4,"float",OP_READ), + op_arg_dat(data_in,1,edgesToCells,4,"float",OP_READ), + op_arg_dat(lim,0,edgesToCells,4,"float",OP_READ), + op_arg_dat(lim,1,edgesToCells,4,"float",OP_READ), + op_arg_dat(edgeLength,-1,OP_ID,1,"float",OP_READ), + op_arg_dat(edgeNormals,-1,OP_ID,2,"float",OP_READ), + op_arg_dat(cellCenters,0,edgesToCells,2,"float",OP_READ), + op_arg_dat(cellCenters,1,edgesToCells,2,"float",OP_READ), + op_arg_dat(edgeCenters,-1,OP_ID,2,"float",OP_READ), + op_arg_dat(GradientatCell,0,edgesToCells,8,"float",OP_READ), + op_arg_dat(GradientatCell,1,edgesToCells,8,"float",OP_READ), + op_arg_dat(isBoundary,-1,OP_ID,1,"int",OP_READ), + op_arg_dat(bathySource,-1,OP_ID,4,"float",OP_WRITE), + op_arg_dat(edgeFluxes,-1,OP_ID,3,"float",OP_WRITE), + op_arg_dat(maxEdgeEigenvalues,-1,OP_ID,1,"float",OP_WRITE), + op_arg_gbl(zmin,1,"float",OP_READ)); + + } + + if 
(*minTimestep >= 0.0){ + op_par_loop_Timestep("Timestep",cells, + op_arg_dat(maxEdgeEigenvalues,0,cellsToEdges,1,"float",OP_READ), + op_arg_dat(maxEdgeEigenvalues,1,cellsToEdges,1,"float",OP_READ), + op_arg_dat(maxEdgeEigenvalues,2,cellsToEdges,1,"float",OP_READ), + op_arg_dat(edgeLength,0,cellsToEdges,1,"float",OP_READ), + op_arg_dat(edgeLength,1,cellsToEdges,1,"float",OP_READ), + op_arg_dat(edgeLength,2,cellsToEdges,1,"float",OP_READ), + op_arg_dat(cellVolumes,-1,OP_ID,1,"float",OP_READ), + op_arg_gbl(minTimestep,1,"float",OP_MIN)); + } + + op_par_loop_NumericalFluxes_sph("NumericalFluxes_sph",edges, + op_arg_dat(data_out,0,edgesToCells,4,"float",OP_INC), + op_arg_dat(data_out,1,edgesToCells,4,"float",OP_INC), + op_arg_dat(cellCenters,0,edgesToCells,2,"float",OP_READ), + op_arg_dat(cellCenters,1,edgesToCells,2,"float",OP_READ), + op_arg_dat(edgeFluxes,-1,OP_ID,3,"float",OP_READ), + op_arg_dat(bathySource,-1,OP_ID,4,"float",OP_READ), + op_arg_dat(edgeNormals,-1,OP_ID,2,"float",OP_READ), + op_arg_dat(isBoundary,-1,OP_ID,1,"int",OP_READ), + op_arg_dat(cellVolumes,0,edgesToCells,1,"float",OP_READ), + op_arg_dat(cellVolumes,1,edgesToCells,1,"float",OP_READ)); + } +} From 0008584b0457478da8b1a69ca7d0d2a81b644d89 Mon Sep 17 00:00:00 2001 From: DanGiles Date: Mon, 15 Nov 2021 13:03:15 +0000 Subject: [PATCH 4/4] Corrected Makefile --- sp/Makefile | 9 --------- 1 file changed, 9 deletions(-) diff --git a/sp/Makefile b/sp/Makefile index 32a8d13..101973a 100755 --- a/sp/Makefile +++ b/sp/Makefile @@ -117,20 +117,11 @@ endif cuda/volna_kernels_cu.o: cuda/volna_kernels.cu \ EvolveValuesRK2_1.h EvolveValuesRK2_2.h applyConst.h getMaxElevation.h getMaxSpeed.h \ initBathymetry_formula.h initBathymetry_update.h initBore_select.h initEta_formula.h initGaussianLandslide.h \ -<<<<<<< HEAD initU_formula.h initV_formula.h computeGradient.h computeFluxes.h limiter.h Timestep.h NumericalFluxes.h simulation_1.h \ values_operation2.h Friction_manning.h cuda/applyConst_kernel.cu 
cuda/EvolveValuesRK2_1_kernel.cu cuda/EvolveValuesRK2_2_kernel.cu \ cuda/computeFluxes_kernel.cu cuda/limiter_kernel.cu cuda/Timestep_kernel.cu cuda/NumericalFluxes_kernel.cu \ cuda/simulation_1_kernel.cu cuda/computeGradient_kernel.cu cuda/getMaxSpeed_kernel.cu cuda/getMaxElevation_kernel.cu \ cuda/values_operation2_kernel.cu cuda/Friction_manning_kernel.cu Makefile -======= - initU_formula.h initV_formula.h computeGradient.h computeFluxes.h limiter.h Timestep.h \ - NumericalFluxes.h simulation_1.h values_operation2.h cuda/applyConst_kernel.cu \ - cuda/EvolveValuesRK2_1_kernel.cu cuda/EvolveValuesRK2_2_kernel.cu cuda/computeFluxes_kernel.cu \ - cuda/limiter_kernel.cu cuda/Timestep_kernel.cu cuda/NumericalFluxes_kernel.cu \ - cuda/simulation_1_kernel.cu cuda/computeGradient_kernel.cu cuda/getMaxSpeed_kernel.cu cuda/getMaxElevation_kernel.cu \ - cuda/values_operation2_kernel.cu Makefile ->>>>>>> 1a93fa5856e7445214d587fca380b11da9ba8b72 nvcc $(VAR) $(INC) $(NVCCFLAGS) $(OP2_INC) $(HDF5_INC) -I$(MPI_INC) -c -o cuda/volna_kernels_cu.o cuda/volna_kernels.cu volna_mpi: volna.cpp volna_event.cpp volna_init.cpp volna_output.cpp volna_writeVTK.cpp volna_simulation.cpp volna_util.cpp Makefile