Skip to content

Commit

Permalink
Rewriting Kernels
Browse files Browse the repository at this point in the history
  • Loading branch information
OsKnoth committed Oct 9, 2024
1 parent 24b9d74 commit b986216
Show file tree
Hide file tree
Showing 2 changed files with 225 additions and 4 deletions.
36 changes: 32 additions & 4 deletions TestKernels/testKernels.jl
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,9 @@ else
backend = CPU()
end




NzG = min(div(NumberThreadGPU,DoF),Nz)
group = (Ord, Ord, NzG, 1)
@show group
ndrange = (Ord, Ord, Nz, NF)
NumGG = min(div(NumberThreadGPU,Nz),NumG)
groupG = (Nz, NumGG)
Expand All @@ -68,8 +66,11 @@ F = KernelAbstractions.ones(backend,FT,Nz,NumG,NumV + NumTr)
CacheF = KernelAbstractions.ones(backend,FT,Nz,NumG,NumV + NumTr)
U = KernelAbstractions.ones(backend,FT,Nz,NumG,NumV + NumTr)
@. U = abs(rand()) + 1
@. U[end,:,4] = 0.0
@views Th = U[:,:,5]
@views Rho = U[:,:,1]
@views Tr = U[:,:,NumV+1:NumV+NumTr]
@views FTr = F[:,:,NumV+1:NumV+NumTr]
D = KernelAbstractions.ones(backend,FT,4,4)
DW = KernelAbstractions.ones(backend,FT,4,4)
dXdxI = KernelAbstractions.ones(backend,FT,3,3,2,DoF,Nz,NF)
Expand Down Expand Up @@ -117,15 +118,42 @@ KernelAbstractions.synchronize(backend)
KernelAbstractions.synchronize(backend)
end

@show "Upwind Tracer"
@. FTh = 0.0
KDivRhoTrUpwind3Kernel! = GPU.DivRhoTrUpwind3Kernel!(backend,group)
KDivRhoTrUpwind3Kernel!(FTh,Th,U,D,dXdxI,J,M,Glob,ndrange=ndrange)
@show sum(abs.(FTh))
KernelAbstractions.synchronize(backend)
@show "Upwind Tracer"
@time for iter = 1 : TestIter
KDivRhoTrUpwind3Kernel!(FTh,Th,U,D,dXdxI,J,M,Glob,ndrange=ndrange)
KernelAbstractions.synchronize(backend)
end

@show "Upwind Tracer New"
@. FTh = 0.0
KDivRhoTrUpwind3NewKernel! = GPU.DivRhoTrUpwind3NewKernel!(backend,group)
KDivRhoTrUpwind3NewKernel!(FTh,Th,U,D,dXdxI,J,M,Glob,ndrange=ndrange)
@show sum(abs.(FTh))
KernelAbstractions.synchronize(backend)
@time for iter = 1 : TestIter
KDivRhoTrUpwind3NewKernel!(FTh,Th,U,D,dXdxI,J,M,Glob,ndrange=ndrange)
KernelAbstractions.synchronize(backend)
end

@show "Upwind Tracer New1"
@. FTr = 0
@. Tr[:,:,1] = Th
@. Tr[:,:,2] = Th
KDivRhoTrUpwind3New1Kernel! = GPU.DivRhoTrUpwind3New1Kernel!(backend,group)
KDivRhoTrUpwind3New1Kernel!(FTr,Tr,U,D,dXdxI,J,M,Glob,ndrange=ndrange)
KernelAbstractions.synchronize(backend)
@show sum(abs.(FTr[:,:,1]))
@show sum(abs.(FTr[:,:,2]))
@time for iter = 1 : TestIter
KDivRhoTrUpwind3New1Kernel!(FTr,Tr,U,D,dXdxI,J,M,Glob,ndrange=ndrange)
KernelAbstractions.synchronize(backend)
end

KHyperViscKoeffKernel! = GPU.HyperViscKoeffKernel!(backend,group)
KHyperViscKoeffKernel!(F,U,CacheF,D,DW,dXdxI,J,M,Glob,KoeffCurl,KoeffGrad,KoeffDiv,ndrange=ndrange)
KernelAbstractions.synchronize(backend)
Expand Down
193 changes: 193 additions & 0 deletions src/GPU/OperatorKernel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,199 @@ end
end
end

@kernel inbounds = true function DivRhoTrUpwind3New1Kernel!(FTr,@Const(Tr),@Const(U),@Const(D),@Const(dXdxI),
@Const(JJ),@Const(M),@Const(Glob))

I, J, iz = @index(Local, NTuple)
_,_,Iz,IF = @index(Global, NTuple)

ColumnTilesDim = @uniform @groupsize()[3]
N = @uniform @groupsize()[1]
Nz = @uniform @ndrange()[3]
NF = @uniform @ndrange()[4]

ID = I + (J - 1) * N
ind = Glob[ID,IF]

cCol = @localmem eltype(FTr) (N,N, ColumnTilesDim+3)
RhoCol = @localmem eltype(FTr) (N,N, ColumnTilesDim+1)
uCol = @localmem eltype(FTr) (N,N, ColumnTilesDim+1)
vCol = @localmem eltype(FTr) (N,N, ColumnTilesDim+1)
wCol = @localmem eltype(FTr) (N,N, ColumnTilesDim)
JCol = @localmem eltype(FTr) (N,N, ColumnTilesDim+3)
if Iz <= Nz
cCol[I,J,iz+1] = Tr[Iz,ind,1] / U[Iz,ind,1]
JCol[I,J,iz+1] = JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]
RhoCol[I,J,iz] = U[Iz,ind,1]
uCol[I,J,iz] = U[Iz,ind,2]
vCol[I,J,iz] = U[Iz,ind,3]
wCol[I,J,iz] = U[Iz,ind,4]
end
if iz == 1
Izm1 = max(Iz - 1,1)
cCol[I,J,iz] = Tr[Izm1,ind,1] / U[Izm1,ind,1]
JCol[I,J,iz] = JJ[ID,1,Izm1,IF] + JJ[ID,2,Izm1,IF]
end
if iz == ColumnTilesDim || Iz == Nz
Izp1 = min(Iz + 1,Nz)
cCol[I,J,iz+2] = Tr[Izp1,ind,1] / U[Izp1,ind,1]
JCol[I,J,iz+2] = JJ[ID,1,Izp1,IF] + JJ[ID,2,Izp1,IF]
Izp2 = min(Iz + 2,Nz)
cCol[I,J,iz+3] = Tr[Izp2,ind,1] / U[Izp2,ind,1]
JCol[I,J,iz+3] = JJ[ID,1,Izp2,IF] + JJ[ID,2,Izp2,IF]
RhoCol[I,J,iz+1] = U[Izp1,ind,1]
uCol[I,J,iz+1] = U[Izp1,ind,2]
vCol[I,J,iz+1] = U[Izp1,ind,3]
end
@synchronize

ID = I + (J - 1) * N
ind = Glob[ID,IF]

if Iz < Nz
@views wCon = Contra3(RhoCol[I,J,iz:iz+1],uCol[I,J,iz:iz+1],vCol[I,J,iz:iz+1],
wCol[I,J,iz],dXdxI[3,:,:,ID,Iz:Iz+1,IF])
wCol[I,J,iz] = wCon
cFL, cFR = RecU4(cCol[I,J,iz],cCol[I,J,iz+1],cCol[I,J,iz+2],cCol[I,J,iz+3],
JCol[I,J,iz],JCol[I,J,iz+1],JCol[I,J,iz+2],JCol[I,J,iz+3])
Flux = eltype(FTr)(0.25) * ((abs(wCon) + wCon) * cFL +
(-abs(wCon) + wCon) * cFR)
@atomic :monotonic FTr[Iz,ind,1] += -Flux / (M[Iz,ind,1] + M[Iz,ind,2])
@atomic :monotonic FTr[Iz+1,ind,1] += Flux / (M[Iz+1,ind,1] + M[Iz+1,ind,2])
end

if Iz <= Nz
uCon, vCon = Contra12(-RhoCol[I,J,iz],uCol[I,J,iz],vCol[I,J,iz],view(dXdxI,1:2,1:2,:,ID,Iz,IF))
uCol[I,J,iz] = uCon
vCol[I,J,iz] = vCon
end
@synchronize

ID = I + (J - 1) * N
ind = Glob[ID,IF]
if Iz <= Nz
DivRhoTr = D[I,1] * uCol[1,J,iz] * cCol[1,J,iz+1] + D[J,1] * vCol[I,1,iz] * cCol[I,1,iz+1]
for k = 2 : N
DivRhoTr += D[I,k] * uCol[k,J,iz] * cCol[k,J,iz+1] + D[J,k] * vCol[I,k,iz] * cCol[I,k,iz+1]
end
@atomic :monotonic FTr[Iz,ind,1] += DivRhoTr / (M[Iz,ind,1] + M[Iz,ind,2])
end

# Second tracer
ID = I + (J - 1) * N
ind = Glob[ID,IF]
if Iz <= Nz
cCol[I,J,iz+1] = Tr[Iz,ind,2] / U[Iz,ind,1]
end
if iz == 1
Izm1 = max(Iz - 1,1)
cCol[I,J,iz] = Tr[Izm1,ind,2] / U[Izm1,ind,1]
end
if iz == ColumnTilesDim || Iz == Nz
Izp1 = min(Iz + 1,Nz)
cCol[I,J,iz+2] = Tr[Izp1,ind,2] / U[Izp1,ind,1]
Izp2 = min(Iz + 2,Nz)
cCol[I,J,iz+3] = Tr[Izp2,ind,2] / U[Izp2,ind,1]
end
@synchronize

ID = I + (J - 1) * N
ind = Glob[ID,IF]
if Iz < Nz
wCon = wCol[I,J,iz]
cFL, cFR = RecU4(cCol[I,J,iz],cCol[I,J,iz+1],cCol[I,J,iz+2],cCol[I,J,iz+3],
JCol[I,J,iz],JCol[I,J,iz+1],JCol[I,J,iz+2],JCol[I,J,iz+3])
Flux = eltype(FTr)(0.25) * ((abs(wCon) + wCon) * cFL +
(-abs(wCon) + wCon) * cFR)
@atomic :monotonic FTr[Iz,ind,2] += -Flux / (M[Iz,ind,1] + M[Iz,ind,2])
@atomic :monotonic FTr[Iz+1,ind,2] += Flux / (M[Iz+1,ind,1] + M[Iz+1,ind,2])
end
if Iz <= Nz
DivRhoTr = D[I,1] * uCol[1,J,iz] * cCol[1,J,iz+1] + D[J,1] * vCol[I,1,iz] * cCol[I,1,iz+1]
for k = 2 : N
DivRhoTr += D[I,k] * uCol[k,J,iz] * cCol[k,J,iz+1] + D[J,k] * vCol[I,k,iz] * cCol[I,k,iz+1]
end
@atomic :monotonic FTr[Iz,ind,2] += DivRhoTr / (M[Iz,ind,1] + M[Iz,ind,2])
end
end

@kernel inbounds = true function DivRhoTrUpwind3NewKernel!(FTr,@Const(Tr),@Const(U),@Const(D),@Const(dXdxI),
@Const(JJ),@Const(M),@Const(Glob))

I, J, iz = @index(Local, NTuple)
_,_,Iz,IF = @index(Global, NTuple)

ColumnTilesDim = @uniform @groupsize()[3]
N = @uniform @groupsize()[1]
Nz = @uniform @ndrange()[3]
NF = @uniform @ndrange()[4]

ID = I + (J - 1) * N
ind = Glob[ID,IF]

cCol = @localmem eltype(FTr) (N,N, ColumnTilesDim+3)
RhoCol = @localmem eltype(FTr) (N,N, ColumnTilesDim+1)
uCol = @localmem eltype(FTr) (N,N, ColumnTilesDim+1)
vCol = @localmem eltype(FTr) (N,N, ColumnTilesDim+1)
wCol = @localmem eltype(FTr) (N,N, ColumnTilesDim)
JCol = @localmem eltype(FTr) (N,N, ColumnTilesDim+3)
if Iz <= Nz
cCol[I,J,iz+1] = Tr[Iz,ind] / U[Iz,ind,1]
JCol[I,J,iz+1] = JJ[ID,1,Iz,IF] + JJ[ID,2,Iz,IF]
RhoCol[I,J,iz] = U[Iz,ind,1]
uCol[I,J,iz] = U[Iz,ind,2]
vCol[I,J,iz] = U[Iz,ind,3]
wCol[I,J,iz] = U[Iz,ind,4]
end
if iz == 1
Izm1 = max(Iz - 1,1)
cCol[I,J,iz] = Tr[Izm1,ind] / U[Izm1,ind,1]
JCol[I,J,iz] = JJ[ID,1,Izm1,IF] + JJ[ID,2,Izm1,IF]
end
if iz == ColumnTilesDim || Iz == Nz
Izp1 = min(Iz + 1,Nz)
cCol[I,J,iz+2] = Tr[Izp1,ind] / U[Izp1,ind,1]
JCol[I,J,iz+2] = JJ[ID,1,Izp1,IF] + JJ[ID,2,Izp1,IF]
Izp2 = min(Iz + 2,Nz)
cCol[I,J,iz+3] = Tr[Izp2,ind] / U[Izp2,ind,1]
JCol[I,J,iz+3] = JJ[ID,1,Izp2,IF] + JJ[ID,2,Izp2,IF]
RhoCol[I,J,iz+1] = U[Izp1,ind,1]
uCol[I,J,iz+1] = U[Izp1,ind,2]
vCol[I,J,iz+1] = U[Izp1,ind,3]
end
@synchronize

ID = I + (J - 1) * N
ind = Glob[ID,IF]
if Iz < Nz
@views wCon = Contra3(RhoCol[I,J,iz:iz+1],uCol[I,J,iz:iz+1],vCol[I,J,iz:iz+1],
wCol[I,J,iz],dXdxI[3,:,:,ID,Iz:Iz+1,IF])
wCol[I,J,iz] = wCon
cFL, cFR = RecU4(cCol[I,J,iz],cCol[I,J,iz+1],cCol[I,J,iz+2],cCol[I,J,iz+3],
JCol[I,J,iz],JCol[I,J,iz+1],JCol[I,J,iz+2],JCol[I,J,iz+3])
Flux = eltype(FTr)(0.25) * ((abs(wCon) + wCon) * cFL +
(-abs(wCon) + wCon) * cFR)
@atomic :monotonic FTr[Iz,ind] += -Flux / (M[Iz,ind,1] + M[Iz,ind,2])
@atomic :monotonic FTr[Iz+1,ind] += Flux / (M[Iz+1,ind,1] + M[Iz+1,ind,2])
end
if Iz <= Nz
uCon, vCon = Contra12(-RhoCol[I,J,iz],uCol[I,J,iz],vCol[I,J,iz],view(dXdxI,1:2,1:2,:,ID,Iz,IF))
uCol[I,J,iz] = uCon
vCol[I,J,iz] = vCon
end
@synchronize

ID = I + (J - 1) * N
ind = Glob[ID,IF]
if Iz <= Nz
DivRhoTr = D[I,1] * uCol[1,J,iz] * cCol[1,J,iz+1] + D[J,1] * vCol[I,1,iz] * cCol[I,1,iz+1]
for k = 2 : N
DivRhoTr += D[I,k] * uCol[k,J,iz] * cCol[k,J,iz+1] + D[J,k] * vCol[I,k,iz] * cCol[I,k,iz+1]
end
@atomic :monotonic FTr[Iz,ind] += DivRhoTr / (M[Iz,ind,1] + M[Iz,ind,2])
end
end

@kernel inbounds = true function AdvectionTrUpwind3Kernel!(FTr,@Const(Tr),@Const(U),@Const(w),@Const(D),@Const(dXdxI),
@Const(JJ),@Const(M),@Const(Glob))

Expand Down

0 comments on commit b986216

Please sign in to comment.