Skip to content

Commit 82513f7

Browse files
committed
TestAMD
1 parent 9afed98 commit 82513f7

File tree

5 files changed

+187
-38
lines changed

5 files changed

+187
-38
lines changed

Examples/testAMD.jl

Lines changed: 117 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -7,51 +7,130 @@ using KernelAbstractions
77
using KernelAbstractions: @atomic, @atomicswap, @atomicreplace
88
using KernelAbstractions.Extras
99

10+
@kernel function MomentumKernel!(F,@Const(U),@Const(D),@Const(dXdxI),
11+
@Const(MRho),@Const(M),@Const(Glob))
1012

13+
I, J, iz = @index(Local, NTuple)
14+
_,_,Iz,IF = @index(Global, NTuple)
1115

12-
@kernel function FluxUpdateKernel!(Flux,@Const(u),@Const(v),@Const(w),@Const(c))
13-
14-
i,j,k = @index(Global, NTuple)
15-
16-
M = @uniform @ndrange()[1]
17-
N = @uniform @ndrange()[2]
18-
L = @uniform @ndrange()[3]
19-
20-
if 1 < i && i < M && j > 1 && j < N && k > 1 && k < L
21-
@inbounds FluxW = eltype(Flux)(0.5) * (u[i-1,j,k] - abs(u[i-1,j,k])) * c[i,j,k]
22-
@inbounds FluxE = eltype(Flux)(0.5) * (u[i,j,k] + abs(u[i,j,k])) * c[i,j,k]
23-
@inbounds FluxS = eltype(Flux)(0.5) * (v[i,j-1,k] - abs(v[i,j-1,k])) * c[i,j,k]
24-
@inbounds FluxN = eltype(Flux)(0.5) * (v[i,j,k] + abs(v[i,j,k])) * c[i,j,k]
25-
@inbounds FluxB = eltype(Flux)(0.5) * (w[i,j,k-1] - abs(w[i,j,k-1])) * c[i,j,k]
26-
@inbounds FluxT = eltype(Flux)(0.5) * (w[i,j,k] + abs(w[i,j,k])) * c[i,j,k]
27-
@inbounds @atomic Flux[i-1,j,k] += -FluxW
28-
@inbounds @atomic Flux[i+1,j,k] += FluxE
29-
@inbounds @atomic Flux[i,j-1,k] += -FluxS
30-
@inbounds @atomic Flux[i,j+1,k] += FluxN
31-
@inbounds @atomic Flux[i,j,k-1] += -FluxB
32-
@inbounds @atomic Flux[i,j,k+1] += FluxT
33-
@inbounds @atomic Flux[i,j,k] += (FluxW - FluxE + FluxS - FluxN + FluxB - FluxT)
16+
ColumnTilesDim = @uniform @groupsize()[3]
17+
N = @uniform @groupsize()[1]
18+
Nz = @uniform @ndrange()[3]
19+
NF = @uniform @ndrange()[4]
20+
21+
ID = I + (J - 1) * N
22+
@inbounds ind = Glob[ID,IF]
23+
24+
RhoCol = @localmem eltype(F) (N,N,ColumnTilesDim)
25+
uCol = @localmem eltype(F) (N,N,ColumnTilesDim)
26+
vCol = @localmem eltype(F) (N,N,ColumnTilesDim)
27+
wCol = @localmem eltype(F) (N,N,ColumnTilesDim+1)
28+
29+
if Iz <= Nz
30+
@inbounds RhoCol[I,J,iz] = U[Iz,ind,1]
31+
@inbounds uCol[I,J,iz] = U[Iz,ind,2]
32+
@inbounds vCol[I,J,iz] = U[Iz,ind,3]
33+
@inbounds wCol[I,J,iz+1] = U[Iz,ind,4]
34+
if Iz == 1
35+
wCol[I,J,1] = -(dXdxI[3,1,1,ID,1,IF] * U[Iz,ind,2] +
36+
dXdxI[3,2,1,ID,1,IF] * U[Iz,ind,3]) / dXdxI[3,3,1,ID,1,IF]
37+
elseif iz == 1
38+
wCol[I,J,1] = U[Iz-1,ind,4]
39+
end
3440
end
35-
end
3641

37-
backend = CUDABackend()
38-
FT = Float32
39-
M = 100
40-
N = 101
41-
L = 102
42+
@synchronize
4243

43-
group = (9, 9 ,9)
44-
ndrange = (M, N , L)
44+
ID = I + (J - 1) * N
45+
@inbounds ind = Glob[ID,IF]
46+
47+
if Iz <= Nz
48+
@inbounds ind = Glob[ID,IF]
49+
@inbounds uCon1 = -RhoCol[I,J,iz] * (dXdxI[1,1,1,ID,Iz,IF] * uCol[I,J,iz] +
50+
dXdxI[1,2,1,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[1,3,1,ID,Iz,IF] * wCol[I,J,iz])
51+
@inbounds uCon2 = -RhoCol[I,J,iz] * (dXdxI[1,1,2,ID,Iz,IF] * uCol[I,J,iz] +
52+
dXdxI[1,2,2,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[1,3,2,ID,Iz,IF] * wCol[I,J,iz+1])
53+
@inbounds vCon1 = -RhoCol[I,J,iz] * (dXdxI[2,1,1,ID,Iz,IF] * uCol[I,J,iz] +
54+
dXdxI[2,2,1,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[2,3,1,ID,Iz,IF] * wCol[I,J,iz])
55+
@inbounds vCon2 = -RhoCol[I,J,iz] * (dXdxI[2,1,2,ID,Iz,IF] * uCol[I,J,iz] +
56+
dXdxI[2,2,2,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[2,3,2,ID,Iz,IF] * wCol[I,J,iz+1])
57+
@inbounds wCon1 = -RhoCol[I,J,iz] * (dXdxI[3,1,1,ID,Iz,IF] * uCol[I,J,iz] +
58+
dXdxI[3,2,1,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[3,3,1,ID,Iz,IF] * wCol[I,J,iz])
59+
@inbounds wCon2 = -RhoCol[I,J,iz] * (dXdxI[3,1,2,ID,Iz,IF] * uCol[I,J,iz] +
60+
dXdxI[3,2,2,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[3,3,2,ID,Iz,IF] * wCol[I,J,iz+1])
61+
62+
@inbounds Dxu = D[I,1] * uCol[1,J,iz]
63+
@inbounds Dyu = D[J,1] * uCol[I,1,iz]
64+
@inbounds Dxv = D[I,1] * vCol[1,J,iz]
65+
@inbounds Dyv = D[J,1] * vCol[I,1,iz]
66+
@inbounds Dxw1 = D[I,1] * wCol[1,J,iz]
67+
@inbounds Dyw1 = D[J,1] * wCol[I,1,iz]
68+
@inbounds Dxw2 = D[I,1] * wCol[1,J,iz+1]
69+
@inbounds Dyw2 = D[J,1] * wCol[I,1,iz+1]
70+
Izp = min(Iz+1,Nz)
71+
Izm = max(Iz-1,1)
72+
ind = Glob[ID,IF]
73+
Dzu2 = eltype(F)(0.5) * (U[Izp,ind,2] - uCol[I,J,iz])
74+
Dzv2 = eltype(F)(0.5) * (U[Izp,ind,3] - vCol[I,J,iz])
75+
Dzu1 = eltype(F)(0.5) * (uCol[I,J,iz] - U[Izm,ind,2])
76+
Dzv1 = eltype(F)(0.5) * (vCol[I,J,iz] - U[Izm,ind,3])
77+
Dzw = eltype(F)(0.5) * (wCol[I,J,iz+1] - wCol[I,J,iz])
78+
for k = 2 : N
79+
@inbounds Dxu += D[I,k] * uCol[k,J,iz]
80+
@inbounds Dyu += D[J,k] * uCol[I,k,iz]
81+
@inbounds Dxv += D[I,k] * vCol[k,J,iz]
82+
@inbounds Dyv += D[J,k] * vCol[I,k,iz]
83+
@inbounds Dxw1 += D[I,k] * wCol[k,J,iz]
84+
@inbounds Dyw1 += D[J,k] * wCol[I,k,iz]
85+
@inbounds Dxw2 += D[I,k] * wCol[k,J,iz+1]
86+
@inbounds Dyw2 += D[J,k] * wCol[I,k,iz+1]
87+
end
88+
89+
@inbounds @atomic F[Iz,ind,2] += ((uCon1 + uCon2) * Dxu + (vCon1 + vCon2) * Dyu +
90+
wCon1 * Dzu1 + wCon2 * Dzu2) / M[Iz,ind] / RhoCol[I,J,iz]
91+
@inbounds @atomic F[Iz,ind,3] += ((uCon1 + uCon2) * Dxv + (vCon1 + vCon2) * Dyv +
92+
wCon1 * Dzv1 + wCon2 * Dzv2) / M[Iz,ind] / RhoCol[I,J,iz]
93+
end
94+
if Iz > 1
95+
@inbounds @atomic F[Iz-1,ind,4] += (uCon1 * Dxw1 + vCon1 * Dyw1 + wCon1 * Dzw) / MRho[Iz-1,ind]
96+
end
97+
if Iz < Nz
98+
@inbounds @atomic F[Iz,ind,4] += (uCon2 * Dxw2 + vCon2 * Dyw2 + wCon2 * Dzw) / MRho[Iz,ind]
99+
end
100+
end
101+
102+
NF = 5400 # 30*30*6
103+
NumG = 48602
104+
Ord = 4
105+
DoF = Ord * Ord
106+
NumV = 5
107+
NumTr = 2
108+
Nz = 64
109+
NumberThreadGPU = 1024
110+
GlobCPU = zeros(Int,DoF,NF)
111+
read!("GlobInd",GlobCPU)
112+
113+
114+
115+
backend = CPU()
116+
#backend = CUDABackend()
117+
FT = Float32
118+
NzG = min(div(NumberThreadGPU,DoF),Nz)
119+
group = (Ord, Ord, NzG, 1)
120+
ndrange = (Ord, Ord, Nz, NF)
45121

46-
u = KernelAbstractions.ones(backend,FT,M+1,N,L)
47-
v = KernelAbstractions.ones(backend,FT,M,N+1,L)
48-
w = KernelAbstractions.ones(backend,FT,M,N,L+1)
49-
c = KernelAbstractions.ones(backend,FT,M,N,L)
50-
Flux = KernelAbstractions.zeros(backend,FT,M,N,L)
122+
F = KernelAbstractions.ones(backend,FT,Nz,NumG,NumV + NumTr)
123+
U = KernelAbstractions.ones(backend,FT,Nz,NumG,NumV + NumTr)
124+
D = KernelAbstractions.ones(backend,FT,4,4)
125+
dXdxI = KernelAbstractions.ones(backend,FT,3,3,2,DoF,Nz,NF)
126+
MRho = KernelAbstractions.ones(backend,FT,Nz-1,NumG)
127+
M = KernelAbstractions.ones(backend,FT,Nz,NumG)
128+
Glob = KernelAbstractions.zeros(backend,Int,DoF,NF)
129+
copyto!(Glob,GlobCPU)
51130

52-
KFluxUpdateKernel! = FluxUpdateKernel!(backend,group)
131+
KMomentumKernel! = MomentumKernel!(backend,group)
53132

54-
KFluxUpdateKernel!(Flux,u,v,w,c,ndrange=ndrange)
133+
KMomentumKernel!(F,U,D,dXdxI,MRho,M,Glob,ndrange=ndrange)
55134
@time for iter = 1 : 100
56-
KFluxUpdateKernel!(Flux,u,v,w,c,ndrange=ndrange)
135+
KMomentumKernel!(F,U,D,dXdxI,MRho,M,Glob,ndrange=ndrange)
57136
end

Examples/testAMD1.jl

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
using MPI
2+
using Base
3+
using CUDA
4+
using AMDGPU
5+
using KernelAbstractions
6+
using KernelAbstractions
7+
using KernelAbstractions: @atomic, @atomicswap, @atomicreplace
8+
using KernelAbstractions.Extras
9+
10+
11+
12+
@kernel function FluxUpdateKernel!(Flux,@Const(u),@Const(v),@Const(w),@Const(c))
13+
14+
i,j,k = @index(Global, NTuple)
15+
16+
M = @uniform @ndrange()[1]
17+
N = @uniform @ndrange()[2]
18+
L = @uniform @ndrange()[3]
19+
20+
if 1 < i && i < M && j > 1 && j < N && k > 1 && k < L
21+
@inbounds FluxW = eltype(Flux)(0.5) * (u[i-1,j,k] - abs(u[i-1,j,k])) * c[i,j,k]
22+
@inbounds FluxE = eltype(Flux)(0.5) * (u[i,j,k] + abs(u[i,j,k])) * c[i,j,k]
23+
@inbounds FluxS = eltype(Flux)(0.5) * (v[i,j-1,k] - abs(v[i,j-1,k])) * c[i,j,k]
24+
@inbounds FluxN = eltype(Flux)(0.5) * (v[i,j,k] + abs(v[i,j,k])) * c[i,j,k]
25+
@inbounds FluxB = eltype(Flux)(0.5) * (w[i,j,k-1] - abs(w[i,j,k-1])) * c[i,j,k]
26+
@inbounds FluxT = eltype(Flux)(0.5) * (w[i,j,k] + abs(w[i,j,k])) * c[i,j,k]
27+
# @inbounds @atomic Flux[i-1,j,k] += -FluxW
28+
# @inbounds @atomic Flux[i+1,j,k] += FluxE
29+
# @inbounds @atomic Flux[i,j-1,k] += -FluxS
30+
# @inbounds @atomic Flux[i,j+1,k] += FluxN
31+
# @inbounds @atomic Flux[i,j,k-1] += -FluxB
32+
# @inbounds @atomic Flux[i,j,k+1] += FluxT
33+
ind = i + j + k
34+
@inbounds @atomic Flux[ind] += (FluxW - FluxE + FluxS - FluxN + FluxB - FluxT)
35+
end
36+
end
37+
38+
backend = CUDABackend()
39+
FT = Float32
40+
M = 100
41+
N = 101
42+
L = 102
43+
44+
group = (9, 9 ,9)
45+
ndrange = (M, N , L)
46+
47+
u = KernelAbstractions.ones(backend,FT,M+1,N,L)
48+
v = KernelAbstractions.ones(backend,FT,M,N+1,L)
49+
w = KernelAbstractions.ones(backend,FT,M,N,L+1)
50+
c = KernelAbstractions.ones(backend,FT,M,N,L)
51+
Flux = KernelAbstractions.zeros(backend,FT,M+N,L)
52+
53+
KFluxUpdateKernel! = FluxUpdateKernel!(backend,group)
54+
55+
KFluxUpdateKernel!(Flux,u,v,w,c,ndrange=ndrange)
56+
@time for iter = 1 : 100
57+
KFluxUpdateKernel!(Flux,u,v,w,c,ndrange=ndrange)
58+
end

GlobInd

675 KB
Binary file not shown.

InfoCUDAAMD

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,13 @@ include("Examples/TestHistogram.jl")
1313
CPU
1414
julia> include("Examples/TestHistogram.jl")
1515
3.429507 seconds (1.18 M allocations: 2.306 GiB, 18.29% gc time)
16+
17+
18+
NEU:
19+
DKRZ Hamburg
20+
julia> include("Examples/testAMD.jl")
21+
0.000805 seconds (5.90 k allocations: 376.562 KiB)
22+
23+
include("Examples/testAMD.jl")
24+
0.001861 seconds (10.80 k allocations: 533.594 KiB)
25+

src/DyCore/NumberingFem.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,8 @@ function NumberingFemCGQuad(Grid,PolyOrd)
153153
end
154154
end
155155
end
156+
@show size(Glob),NumG
157+
write("GlobInd",Glob)
156158

157159
return (Glob,NumG,NumI,Stencil,MasterSlave)
158160
end

0 commit comments

Comments
 (0)