Commit 990e3d4

AdvectionTestProblem

1 parent f84c549 commit 990e3d4

File tree

12 files changed: +185 −46 lines changed

Examples/TestHistogram.jl

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
using CUDA
using AMDGPU
using KernelAbstractions, Test
using KernelAbstractions: @atomic, @atomicswap, @atomicreplace

# Serial function used as a baseline for CPU metrics and correctness checks
function create_histogram(input)
  histogram_output = zeros(Int, maximum(input))
  for i in input
    histogram_output[i] += 1
  end
  return histogram_output
end

# This is a 1D histogram kernel where the histogramming happens in shared memory (shmem)
@kernel function histogram_kernel!(histogram_output, input)
  tid = @index(Global, Linear)
  lid = @index(Local, Linear)

  @uniform warpsize = Int(32)

  @uniform gs = @groupsize()[1]
  @uniform N = length(histogram_output)

  shared_histogram = @localmem Int (gs)

  # Go through all input elements and assign each one to a location in shmem.
  # If there is not enough shmem for all bins, separate shmem blocks are used.
  # For example, if shmem holds 256 bins but values up to 312 can occur, there
  # are two blocks: one covering bins 1-256 and another covering bins 257-512.
  @uniform max_element = 1
  for min_element = 1:gs:N

    # Reset shared_histogram to 0
    @inbounds shared_histogram[lid] = 0
    @synchronize()

    max_element = min_element + gs
    if max_element > N
      max_element = N + 1
    end

    # Compute the bin relative to the current shmem block and write to it if it
    # falls inside the block
    bin = input[tid]
    if bin >= min_element && bin < max_element
      bin -= min_element - 1
      @atomic shared_histogram[bin] += 1
    end

    @synchronize()

    # Accumulate this block's shared histogram into the global histogram
    if ((lid + min_element - 1) <= N)
      @atomic histogram_output[lid + min_element - 1] += shared_histogram[lid]
    end

  end

end

function histogram!(histogram_output, input)
  backend = get_backend(histogram_output)
  # Need a static block size
  kernel! = histogram_kernel!(backend, (256,))
  kernel!(histogram_output, input, ndrange=size(input))
  return histogram_output
end

function move(backend, input)
  # TODO replace with adapt(backend, input)
  out = KernelAbstractions.allocate(backend, eltype(input), size(input))
  KernelAbstractions.copyto!(backend, out, input)
  return out
end


backend = CPU()
#backend = CUDABackend()
#backend = ROCBackend()
rand_input = [rand(1:128) for i = 1:1000000]
rand_input = move(backend, rand_input)

rand_histogram = KernelAbstractions.zeros(backend, Int, 128)
histogram!(rand_histogram, rand_input)
KernelAbstractions.synchronize(backend)
@time begin
  for i = 1:300
    histogram!(rand_histogram, rand_input)
    KernelAbstractions.synchronize(backend)
  end
end
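Since the file keeps the serial create_histogram as a baseline, a correctness check fits naturally after the timing loop. A minimal sketch, not part of the commit: it assumes the value 128 actually occurs in rand_input (so both arrays have length 128) and copies device data back to the host with Array.

# Correctness sketch (assumption, not in the commit): one fresh kernel run
# compared against the serial CPU baseline on host copies of the data.
check_histogram = KernelAbstractions.zeros(backend, Int, 128)
histogram!(check_histogram, rand_input)
KernelAbstractions.synchronize(backend)
@test Array(check_histogram) == create_histogram(Array(rand_input))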

Examples/testAdvectionCart.jl

Lines changed: 4 additions & 0 deletions
@@ -223,13 +223,17 @@ elseif Problem == "LimAdvectionCart"
 end
 
 U = GPU.InitialConditionsAdvection(backend,FTB,CG,Metric,Phys,Global,Profile,Param)
+@show maximum(abs.(U[:,:,2]))
+@show maximum(abs.(U[:,:,3]))
 
 # Output
 Global.Output.vtkFileName=string(Problem*"_")
 Global.Output.vtk=0
 Global.Output.Flat=true
 Global.Output.H=H
 Global.Output.cNames = [
+  "u",
+  "v",
   "Rho",
   "Tr1",
 ]

InfoCUDAAMD

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
CUDA
DKRZ Hamburg, A100
julia> include("Examples/TestHistogram.jl")
0.013298 seconds (13.80 k allocations: 623.438 KiB)


AMD
https://lumi-supercomputer.eu/
2978 nodes with 4 AMD MI250x GPUs and a single 64-core AMD EPYC "Trento" CPU
include("Examples/TestHistogram.jl")
0.508759 seconds (332.16 k allocations: 6.289 MiB)

CPU
julia> include("Examples/TestHistogram.jl")
3.429507 seconds (1.18 M allocations: 2.306 GiB, 18.29% gc time)

Jobs/JobAdvectionLimCart

Lines changed: 10 additions & 10 deletions
@@ -1,28 +1,28 @@
 mpirun -n 6 julia --project Examples/testAdvectionCart.jl \
   --Problem="LimAdvectionCart" \
   --Device="CPU" \
-  --FloatTypeBackend="Float32" \
+  --FloatTypeBackend="Float64" \
   --NumV=5 \
   --NumTr=1 \
   --HorLimit=true \
   --Upwind=true \
   --vtkFileName="LimAdvectionCart" \
   --SimTime=0.0 \
-  --PrintTime=0.2 \
-  --dtau=0.005 \
+  --PrintTime=0.1 \
+  --dtau=0.0025 \
   --IntMethod="SSPRungeKutta" \
   --Table="SSP32" \
   --Lx=0 \
   --Ly=0 \
   --H=0 \
   --x0=0 \
   --y0=0 \
-  --nx=10 \
-  --ny=10 \
-  --nz=10 \
+  --nx=40 \
+  --ny=40 \
+  --nz=1 \
   --OrdPoly=4 \
-  --BoundaryWE="Period" \
-  --BoundarySN="Period" \
+  --BoundaryWE="" \
+  --BoundarySN="" \
   --BoundaryBT="" \
-  --HyperVisc=false \
-  --HyperDDiv=0.e0
+  --HyperVisc=true \
+  --HyperDDiv=1.e-4

listRecv

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
"Recv", 1, 6, 37, 10
"Recv", 1, 2, 13, 12
"Recv", 5, 4, 29, 11
"Recv", 5, 3, 23, 1
"Recv", 5, 6, 41, 12
"Recv", 6, 5, 36, 12
"Recv", 6, 1, 12, 10
"Recv", 2, 1, 8, 12
"Recv", 2, 3, 20, 12
"Recv", 4, 3, 22, 11
"Recv", 4, 5, 34, 12
"Recv", 3, 2, 15, 12
"Recv", 3, 4, 27, 11
"Recv", 3, 5, 33, 1

listSend

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
"Send", 1, 6, 12, 10
"Send", 1, 2, 8, 12
"Send", 5, 4, 34, 11
"Send", 5, 3, 33, 1
"Send", 5, 6, 36, 12
"Send", 6, 5, 41, 12
"Send", 6, 1, 37, 10
"Send", 2, 1, 13, 12
"Send", 2, 3, 15, 12
"Send", 4, 3, 27, 11
"Send", 4, 5, 29, 12
"Send", 3, 2, 20, 12
"Send", 3, 4, 22, 11
"Send", 3, 5, 23, 1

src/Examples/initial.jl

Lines changed: 1 addition & 0 deletions
@@ -89,6 +89,7 @@ function (profile::LimAdvectionCartExample)(Param,Phys)
   u = -Param.u0 * x[2] * cospi(time / Param.end_time)
   v = Param.u0 * x[1] * cospi(time / Param.end_time)
   w = Param.u0 * sinpi(x[3] / Param.zmax) * cospi(time / Param.end_time)
+  w = FT(0)
 
   return (Rho,u,v,w,Tr)
 end

src/GPU/FcnGPU.jl

Lines changed: 12 additions & 7 deletions
@@ -69,17 +69,19 @@ function FcnAdvectionGPU!(F,U,time,FE,Metric,Phys,Cache,Exchange,Global,Param,Pr
   KernelAbstractions.synchronize(backend)
 
   # Hyperviscosity Part 1
-  KHyperViscTracerKernel!(CacheTr,U[:,:,1+NumV],Rho,DS,DW,dXdxI,J,M,Glob,ndrange=ndrange)
-  KernelAbstractions.synchronize(backend)
+  if ~Global.Model.HorLimit
+    KHyperViscTracerKernel!(CacheTr,U[:,:,1+NumV],Rho,DS,DW,dXdxI,J,M,Glob,ndrange=ndrange)
+    KernelAbstractions.synchronize(backend)
 
-  # Data exchange
-  Parallels.ExchangeData3DSendGPU(CacheTr,Exchange)
-  Parallels.ExchangeData3DRecvGPU!(CacheTr,Exchange)
+    # Data exchange
+    Parallels.ExchangeData3DSendGPU(CacheTr,Exchange)
+    Parallels.ExchangeData3DRecvGPU!(CacheTr,Exchange)
+  end
 
   F .= FT(0)
 
-  KDivRhoKernel!(F,U,DS,dXdxI,J,M,Glob,ndrange=ndrange)
-  KernelAbstractions.synchronize(backend)
+  #KDivRhoKernel!(F,U,DS,dXdxI,J,M,Glob,ndrange=ndrange)
+  #KernelAbstractions.synchronize(backend)
 
   if Global.Model.HorLimit
     @views KDivRhoTrUpwind3LimKernel!(F[:,:,1+NumV],U[:,:,1+NumV],U,DS,

@@ -368,6 +370,9 @@ function FcnGPUAMD!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,DiscType)
   @views CH_I = Cache.CH[:,NBF+1:NF]
   @views MRho = CacheF[:,:,6]
   @. MRho = FT(1)
+  @show size(U)
+  @show NF
+  stop
   # Ranges
   NzG = min(div(NumberThreadGPU,N*N),Nz)
   group = (N, N, NzG, 1)

src/GPU/InitialConditions.jl

Lines changed: 2 additions & 0 deletions
@@ -118,6 +118,8 @@ function InitialConditionsAdvection(backend,FTB,CG,Metric,Phys,Global,Profile,Pa
   group = (N * N, NzG, 1)
   ndrange = (N * N, Nz, NF)
 
+  @show NF
+
   U = KernelAbstractions.zeros(backend,FTB,Nz,CG.NumG,NumV+NumTr)
   @views Rho = U[:,:,Model.RhoPos]
   @views u = U[:,:,Model.uPos]

src/Grids/SubGrid.jl

Lines changed: 0 additions & 1 deletion
@@ -143,7 +143,6 @@ function ConstructSubGrid(GlobalGrid,Proc,ProcNumber)
   Dim=3;
   Renumbering!(Edges,Faces);
   FacesInNodes!(Nodes,Faces)
-
   Form = GlobalGrid.Form
   Rad = GlobalGrid.Rad
   # Stencil

src/Outputs/vtkSphere.jl

Lines changed: 0 additions & 1 deletion
@@ -294,7 +294,6 @@ function unstructured_vtkSphere(U,Trans,CG,Metric,Cache,Global, part::Int, npart
   uCell = zeros(OrdPrint*OrdPrint*nz*NF)
   @views InterpolateGPU!(cCell,U[:,:,uPos],vtkInter,CG.Glob)
   @views copyto!(uCell,reshape(cCell,OrdPrint*OrdPrint*nz*NF))
-  # @views Interpolate!(uCell,U[:,:,uPos],vtkInter,OrdPoly,OrdPrint,CG.Glob,NF,nz)
   vtk["u", VTKCellData()] = uCell
 elseif str == "Rhou"
   uPos = Global.Model.uPos

src/Parallels/Exchange.jl

Lines changed: 21 additions & 27 deletions
@@ -80,17 +80,21 @@ function ExchangeStruct{FT}(backend,SubGrid,OrdPoly,CellToProc,Proc,ProcNumber,H
   InBoundEdgesP = zeros(Int,NumInBoundEdges)
   NumInBoundEdges = 0
   @inbounds for i = 1:SubGrid.NumEdges
-    if CellToProc[SubGrid.Edges[i].FG[1]] == Proc
-    else
-      NumInBoundEdges += 1
-      push!(InBoundEdges, i)
-      push!(InBoundEdgesP, CellToProc[SubGrid.Edges[i].FG[1]])
-    end
-    if CellToProc[SubGrid.Edges[i].FG[2]] == Proc
-    else
-      NumInBoundEdges += 1
-      push!(InBoundEdges, i)
-      push!(InBoundEdgesP, CellToProc[SubGrid.Edges[i].FG[1]])
+    if SubGrid.Edges[i].FG[1] > 0
+      if CellToProc[SubGrid.Edges[i].FG[1]] == Proc
+      else
+        NumInBoundEdges += 1
+        push!(InBoundEdges, i)
+        push!(InBoundEdgesP, CellToProc[SubGrid.Edges[i].FG[1]])
+      end
+    end
+    if SubGrid.Edges[i].FG[2] > 0
+      if CellToProc[SubGrid.Edges[i].FG[2]] == Proc
+      else
+        NumInBoundEdges += 1
+        push!(InBoundEdges, i)
+        push!(InBoundEdgesP, CellToProc[SubGrid.Edges[i].FG[1]])
+      end
     end
   end
 

@@ -633,8 +637,8 @@ function ExchangeDataFSend(cFMin,cFMax,Exchange)
   nT = size(cFMin,3)
   if Exchange.InitRecvBufferF
     @inbounds for iP in NeiProc
-      Exchange.RecvBufferF[iP] = zeros(nz,length(IndRecvBufferF[iP]),nT,2)
-      Exchange.SendBufferF[iP] = zeros(nz,length(IndRecvBufferF[iP]),nT,2)
+      Exchange.RecvBufferF[iP] = zeros(nz,length(IndRecvBufferF[iP]),2,nT)
+      Exchange.SendBufferF[iP] = zeros(nz,length(IndSendBufferF[iP]),2,nT)
     end
     RecvBufferF = Exchange.RecvBufferF
     SendBufferF = Exchange.SendBufferF

@@ -653,29 +657,20 @@ function ExchangeDataFSend(cFMin,cFMax,Exchange)
     i = 0
     @views @inbounds for Ind in IndSendBufferF[iP]
       i += 1
-      @views @. SendBufferF[iP][:,i,:,1] = cFMin[:,Ind,:]
-      @views @. SendBufferF[iP][:,i,:,2] = cFMax[:,Ind,:]
+      @views @. SendBufferF[iP][:,i,1,:] = cFMin[:,Ind,:]
+      @views @. SendBufferF[iP][:,i,2,:] = cFMax[:,Ind,:]
     end
   end
   i = 0
-  # @show size(rreq),size(sreq)
   @inbounds for iP in NeiProc
     tag = Proc + ProcNumber*iP
     i += 1
-    @show "Recv",Proc,iP,tag,size(RecvBufferF[iP])
-    if Proc == 3 && iP == 4
-      @show IndRecvBufferF[iP]
-    end
     @views MPI.Irecv!(RecvBufferF[iP], iP - 1, tag, MPI.COMM_WORLD, rreq[i])
   end
   i = 0
   @inbounds for iP in NeiProc
     tag = iP + ProcNumber*Proc
     i += 1
-    @show "Send",Proc,iP,tag,size(SendBufferF[iP])
-    if Proc == 4 && iP == 3
-      @show IndSendBufferF[iP]
-    end
     @views MPI.Isend(SendBufferF[iP], iP - 1, tag, MPI.COMM_WORLD, sreq[i])
   end
 end

@@ -694,11 +689,10 @@ function ExchangeDataFRecv!(cFMin,cFMax,Exchange)
   #Receive
   @inbounds for iP in NeiProc
     i = 0
-    # @show iP,IndRecvBufferF[iP]
     @inbounds for Ind in IndRecvBufferF[iP]
       i += 1
-      @views @. cFMin[:,Ind,:] = RecvBufferF[iP][:,i,:,1]
-      @views @. cFMax[:,Ind,:] = RecvBufferF[iP][:,i,:,2]
+      @views @. cFMin[:,Ind,:] = RecvBufferF[iP][:,i,1,:]
+      @views @. cFMax[:,Ind,:] = RecvBufferF[iP][:,i,2,:]
     end
   end
 end
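The last three hunks change the face-buffer layout from (nz, index, tracer, min/max) to (nz, index, min/max, tracer) and size the send buffer with IndSendBufferF instead of IndRecvBufferF; pack and unpack must index the buffer in exactly the same order. A minimal standalone sketch of that pairing, with hypothetical sizes and indices that are not part of the commit:

# Standalone sketch of the (nz, halo index, min/max, tracer) buffer layout.
# All sizes and the index list below are hypothetical.
nz, nT, NumG = 4, 2, 10
Ind = [3, 7, 9]                      # halo column indices for one neighbor
cFMin = rand(nz, NumG, nT)
cFMax = rand(nz, NumG, nT)

buf = zeros(nz, length(Ind), 2, nT)  # mirrors zeros(nz,length(IndSendBufferF[iP]),2,nT)
for (i, ind) in enumerate(Ind)
  @views @. buf[:, i, 1, :] = cFMin[:, ind, :]   # pack minima
  @views @. buf[:, i, 2, :] = cFMax[:, ind, :]   # pack maxima
end

# The receiving side unpacks with the same dimension order.
for (i, ind) in enumerate(Ind)
  @views @. cFMin[:, ind, :] = buf[:, i, 1, :]
  @views @. cFMax[:, ind, :] = buf[:, i, 2, :]
end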
