diff --git a/TestKernels/testKernels.jl b/TestKernels/testKernels.jl index f797d64..3b56877 100644 --- a/TestKernels/testKernels.jl +++ b/TestKernels/testKernels.jl @@ -53,11 +53,9 @@ else backend = CPU() end - - - NzG = min(div(NumberThreadGPU,DoF),Nz) group = (Ord, Ord, NzG, 1) +@show group ndrange = (Ord, Ord, Nz, NF) NumGG = min(div(NumberThreadGPU,Nz),NumG) groupG = (Nz, NumGG) @@ -68,8 +66,11 @@ F = KernelAbstractions.ones(backend,FT,Nz,NumG,NumV + NumTr) CacheF = KernelAbstractions.ones(backend,FT,Nz,NumG,NumV + NumTr) U = KernelAbstractions.ones(backend,FT,Nz,NumG,NumV + NumTr) @. U = abs(rand()) + 1 +@. U[end,:,4] = 0.0 @views Th = U[:,:,5] @views Rho = U[:,:,1] +@views Tr = U[:,:,NumV+1:NumV+NumTr] +@views FTr = F[:,:,NumV+1:NumV+NumTr] D = KernelAbstractions.ones(backend,FT,4,4) DW = KernelAbstractions.ones(backend,FT,4,4) dXdxI = KernelAbstractions.ones(backend,FT,3,3,2,DoF,Nz,NF) @@ -117,15 +118,42 @@ KernelAbstractions.synchronize(backend) KernelAbstractions.synchronize(backend) end +@show "Upwind Tracer" +@. FTh = 0.0 KDivRhoTrUpwind3Kernel! = GPU.DivRhoTrUpwind3Kernel!(backend,group) KDivRhoTrUpwind3Kernel!(FTh,Th,U,D,dXdxI,J,M,Glob,ndrange=ndrange) +@show sum(abs.(FTh)) KernelAbstractions.synchronize(backend) -@show "Upwind Tracer" @time for iter = 1 : TestIter KDivRhoTrUpwind3Kernel!(FTh,Th,U,D,dXdxI,J,M,Glob,ndrange=ndrange) KernelAbstractions.synchronize(backend) end +@show "Upwind Tracer New" +@. FTh = 0.0 +KDivRhoTrUpwind3NewKernel! = GPU.DivRhoTrUpwind3NewKernel!(backend,group) +KDivRhoTrUpwind3NewKernel!(FTh,Th,U,D,dXdxI,J,M,Glob,ndrange=ndrange) +@show sum(abs.(FTh)) +KernelAbstractions.synchronize(backend) +@time for iter = 1 : TestIter + KDivRhoTrUpwind3NewKernel!(FTh,Th,U,D,dXdxI,J,M,Glob,ndrange=ndrange) + KernelAbstractions.synchronize(backend) +end + +@show "Upwind Tracer New1" +@. FTr = 0 +@. Tr[:,:,1] = Th +@. Tr[:,:,2] = Th +KDivRhoTrUpwind3New1Kernel! = GPU.DivRhoTrUpwind3New1Kernel!(backend,group) +KDivRhoTrUpwind3New1Kernel!(FTr,Tr,U,D,dXdxI,J,M,Glob,ndrange=ndrange) +KernelAbstractions.synchronize(backend) +@show sum(abs.(FTr[:,:,1])) +@show sum(abs.(FTr[:,:,2])) +@time for iter = 1 : TestIter + KDivRhoTrUpwind3New1Kernel!(FTr,Tr,U,D,dXdxI,J,M,Glob,ndrange=ndrange) + KernelAbstractions.synchronize(backend) +end + KHyperViscKoeffKernel! = GPU.HyperViscKoeffKernel!(backend,group) KHyperViscKoeffKernel!(F,U,CacheF,D,DW,dXdxI,J,M,Glob,KoeffCurl,KoeffGrad,KoeffDiv,ndrange=ndrange) KernelAbstractions.synchronize(backend) diff --git a/src/GPU/OperatorKernel.jl b/src/GPU/OperatorKernel.jl index 10f7878..77389df 100644 --- a/src/GPU/OperatorKernel.jl +++ b/src/GPU/OperatorKernel.jl @@ -516,6 +516,199 @@ end end end +@kernel inbounds = true function DivRhoTrUpwind3New1Kernel!(FTr,@Const(Tr),@Const(U),@Const(D),@Const(dXdxI), + @Const(JJ),@Const(M),@Const(Glob)) + + I, J, iz = @index(Local, NTuple) + _,_,Iz,IF = @index(Global, NTuple) + + ColumnTilesDim = @uniform @groupsize()[3] + N = @uniform @groupsize()[1] + Nz = @uniform @ndrange()[3] + NF = @uniform @ndrange()[4] + + ID = I + (J - 1) * N + ind = Glob[ID,IF] + + cCol = @localmem eltype(FTr) (N,N, ColumnTilesDim+3) + RhoCol = @localmem eltype(FTr) (N,N, ColumnTilesDim+1) + uCol = @localmem eltype(FTr) (N,N, ColumnTilesDim+1) + vCol = @localmem eltype(FTr) (N,N, ColumnTilesDim+1) + wCol = @localmem eltype(FTr) (N,N, ColumnTilesDim) + JCol = @localmem eltype(FTr) (N,N, ColumnTilesDim+3) + if Iz <= Nz + cCol[I,J,iz+1] = Tr[Iz,ind,1] / U[Iz,ind,1] + JCol[I,J,iz+1] = JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF] + RhoCol[I,J,iz] = U[Iz,ind,1] + uCol[I,J,iz] = U[Iz,ind,2] + vCol[I,J,iz] = U[Iz,ind,3] + wCol[I,J,iz] = U[Iz,ind,4] + end + if iz == 1 + Izm1 = max(Iz - 1,1) + cCol[I,J,iz] = Tr[Izm1,ind,1] / U[Izm1,ind,1] + JCol[I,J,iz] = JJ[ID,1,Izm1,IF] + JJ[ID,2,Izm1,IF] + end + if iz == ColumnTilesDim || Iz == Nz + Izp1 = min(Iz + 1,Nz) + cCol[I,J,iz+2] = Tr[Izp1,ind,1] / U[Izp1,ind,1] + JCol[I,J,iz+2] = JJ[ID,1,Izp1,IF] + JJ[ID,2,Izp1,IF] + Izp2 = min(Iz + 2,Nz) + cCol[I,J,iz+3] = Tr[Izp2,ind,1] / U[Izp2,ind,1] + JCol[I,J,iz+3] = JJ[ID,1,Izp2,IF] + JJ[ID,2,Izp2,IF] + RhoCol[I,J,iz+1] = U[Izp1,ind,1] + uCol[I,J,iz+1] = U[Izp1,ind,2] + vCol[I,J,iz+1] = U[Izp1,ind,3] + end + @synchronize + + ID = I + (J - 1) * N + ind = Glob[ID,IF] + + if Iz < Nz + @views wCon = Contra3(RhoCol[I,J,iz:iz+1],uCol[I,J,iz:iz+1],vCol[I,J,iz:iz+1], + wCol[I,J,iz],dXdxI[3,:,:,ID,Iz:Iz+1,IF]) + wCol[I,J,iz] = wCon + cFL, cFR = RecU4(cCol[I,J,iz],cCol[I,J,iz+1],cCol[I,J,iz+2],cCol[I,J,iz+3], + JCol[I,J,iz],JCol[I,J,iz+1],JCol[I,J,iz+2],JCol[I,J,iz+3]) + Flux = eltype(FTr)(0.25) * ((abs(wCon) + wCon) * cFL + + (-abs(wCon) + wCon) * cFR) + @atomic :monotonic FTr[Iz,ind,1] += -Flux / (M[Iz,ind,1] + M[Iz,ind,2]) + @atomic :monotonic FTr[Iz+1,ind,1] += Flux / (M[Iz+1,ind,1] + M[Iz+1,ind,2]) + end + + if Iz <= Nz + uCon, vCon = Contra12(-RhoCol[I,J,iz],uCol[I,J,iz],vCol[I,J,iz],view(dXdxI,1:2,1:2,:,ID,Iz,IF)) + uCol[I,J,iz] = uCon + vCol[I,J,iz] = vCon + end + @synchronize + + ID = I + (J - 1) * N + ind = Glob[ID,IF] + if Iz <= Nz + DivRhoTr = D[I,1] * uCol[1,J,iz] * cCol[1,J,iz+1] + D[J,1] * vCol[I,1,iz] * cCol[I,1,iz+1] + for k = 2 : N + DivRhoTr += D[I,k] * uCol[k,J,iz] * cCol[k,J,iz+1] + D[J,k] * vCol[I,k,iz] * cCol[I,k,iz+1] + end + @atomic :monotonic FTr[Iz,ind,1] += DivRhoTr / (M[Iz,ind,1] + M[Iz,ind,2]) + end + +# Second tracer + ID = I + (J - 1) * N + ind = Glob[ID,IF] + if Iz <= Nz + cCol[I,J,iz+1] = Tr[Iz,ind,2] / U[Iz,ind,1] + end + if iz == 1 + Izm1 = max(Iz - 1,1) + cCol[I,J,iz] = Tr[Izm1,ind,2] / U[Izm1,ind,1] + end + if iz == ColumnTilesDim || Iz == Nz + Izp1 = min(Iz + 1,Nz) + cCol[I,J,iz+2] = Tr[Izp1,ind,2] / U[Izp1,ind,1] + Izp2 = min(Iz + 2,Nz) + cCol[I,J,iz+3] = Tr[Izp2,ind,2] / U[Izp2,ind,1] + end + @synchronize + + ID = I + (J - 1) * N + ind = Glob[ID,IF] + if Iz < Nz + wCon = wCol[I,J,iz] + cFL, cFR = RecU4(cCol[I,J,iz],cCol[I,J,iz+1],cCol[I,J,iz+2],cCol[I,J,iz+3], + JCol[I,J,iz],JCol[I,J,iz+1],JCol[I,J,iz+2],JCol[I,J,iz+3]) + Flux = eltype(FTr)(0.25) * ((abs(wCon) + wCon) * cFL + + (-abs(wCon) + wCon) * cFR) + @atomic :monotonic FTr[Iz,ind,2] += -Flux / (M[Iz,ind,1] + M[Iz,ind,2]) + @atomic :monotonic FTr[Iz+1,ind,2] += Flux / (M[Iz+1,ind,1] + M[Iz+1,ind,2]) + end + if Iz <= Nz + DivRhoTr = D[I,1] * uCol[1,J,iz] * cCol[1,J,iz+1] + D[J,1] * vCol[I,1,iz] * cCol[I,1,iz+1] + for k = 2 : N + DivRhoTr += D[I,k] * uCol[k,J,iz] * cCol[k,J,iz+1] + D[J,k] * vCol[I,k,iz] * cCol[I,k,iz+1] + end + @atomic :monotonic FTr[Iz,ind,2] += DivRhoTr / (M[Iz,ind,1] + M[Iz,ind,2]) + end +end + +@kernel inbounds = true function DivRhoTrUpwind3NewKernel!(FTr,@Const(Tr),@Const(U),@Const(D),@Const(dXdxI), + @Const(JJ),@Const(M),@Const(Glob)) + + I, J, iz = @index(Local, NTuple) + _,_,Iz,IF = @index(Global, NTuple) + + ColumnTilesDim = @uniform @groupsize()[3] + N = @uniform @groupsize()[1] + Nz = @uniform @ndrange()[3] + NF = @uniform @ndrange()[4] + + ID = I + (J - 1) * N + ind = Glob[ID,IF] + + cCol = @localmem eltype(FTr) (N,N, ColumnTilesDim+3) + RhoCol = @localmem eltype(FTr) (N,N, ColumnTilesDim+1) + uCol = @localmem eltype(FTr) (N,N, ColumnTilesDim+1) + vCol = @localmem eltype(FTr) (N,N, ColumnTilesDim+1) + wCol = @localmem eltype(FTr) (N,N, ColumnTilesDim) + JCol = @localmem eltype(FTr) (N,N, ColumnTilesDim+3) + if Iz <= Nz + cCol[I,J,iz+1] = Tr[Iz,ind] / U[Iz,ind,1] + JCol[I,J,iz+1] = JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF] + RhoCol[I,J,iz] = U[Iz,ind,1] + uCol[I,J,iz] = U[Iz,ind,2] + vCol[I,J,iz] = U[Iz,ind,3] + wCol[I,J,iz] = U[Iz,ind,4] + end + if iz == 1 + Izm1 = max(Iz - 1,1) + cCol[I,J,iz] = Tr[Izm1,ind] / U[Izm1,ind,1] + JCol[I,J,iz] = JJ[ID,1,Izm1,IF] + JJ[ID,2,Izm1,IF] + end + if iz == ColumnTilesDim || Iz == Nz + Izp1 = min(Iz + 1,Nz) + cCol[I,J,iz+2] = Tr[Izp1,ind] / U[Izp1,ind,1] + JCol[I,J,iz+2] = JJ[ID,1,Izp1,IF] + JJ[ID,2,Izp1,IF] + Izp2 = min(Iz + 2,Nz) + cCol[I,J,iz+3] = Tr[Izp2,ind] / U[Izp2,ind,1] + JCol[I,J,iz+3] = JJ[ID,1,Izp2,IF] + JJ[ID,2,Izp2,IF] + RhoCol[I,J,iz+1] = U[Izp1,ind,1] + uCol[I,J,iz+1] = U[Izp1,ind,2] + vCol[I,J,iz+1] = U[Izp1,ind,3] + end + @synchronize + + ID = I + (J - 1) * N + ind = Glob[ID,IF] + if Iz < Nz + @views wCon = Contra3(RhoCol[I,J,iz:iz+1],uCol[I,J,iz:iz+1],vCol[I,J,iz:iz+1], + wCol[I,J,iz],dXdxI[3,:,:,ID,Iz:Iz+1,IF]) + wCol[I,J,iz] = wCon + cFL, cFR = RecU4(cCol[I,J,iz],cCol[I,J,iz+1],cCol[I,J,iz+2],cCol[I,J,iz+3], + JCol[I,J,iz],JCol[I,J,iz+1],JCol[I,J,iz+2],JCol[I,J,iz+3]) + Flux = eltype(FTr)(0.25) * ((abs(wCon) + wCon) * cFL + + (-abs(wCon) + wCon) * cFR) + @atomic :monotonic FTr[Iz,ind] += -Flux / (M[Iz,ind,1] + M[Iz,ind,2]) + @atomic :monotonic FTr[Iz+1,ind] += Flux / (M[Iz+1,ind,1] + M[Iz+1,ind,2]) + end + if Iz <= Nz + uCon, vCon = Contra12(-RhoCol[I,J,iz],uCol[I,J,iz],vCol[I,J,iz],view(dXdxI,1:2,1:2,:,ID,Iz,IF)) + uCol[I,J,iz] = uCon + vCol[I,J,iz] = vCon + end + @synchronize + + ID = I + (J - 1) * N + ind = Glob[ID,IF] + if Iz <= Nz + DivRhoTr = D[I,1] * uCol[1,J,iz] * cCol[1,J,iz+1] + D[J,1] * vCol[I,1,iz] * cCol[I,1,iz+1] + for k = 2 : N + DivRhoTr += D[I,k] * uCol[k,J,iz] * cCol[k,J,iz+1] + D[J,k] * vCol[I,k,iz] * cCol[I,k,iz+1] + end + @atomic :monotonic FTr[Iz,ind] += DivRhoTr / (M[Iz,ind,1] + M[Iz,ind,2]) + end +end + @kernel inbounds = true function AdvectionTrUpwind3Kernel!(FTr,@Const(Tr),@Const(U),@Const(w),@Const(D),@Const(dXdxI), @Const(JJ),@Const(M),@Const(Glob))