Commit 990e3d4

AdvectionTestProblem

1 parent f84c549 commit 990e3d4

File tree

12 files changed: +185 −46 lines changed

Examples/TestHistogram.jl

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
using CUDA
using AMDGPU
using KernelAbstractions, Test
using KernelAbstractions: @atomic, @atomicswap, @atomicreplace

# Serial function used as a baseline for CPU metrics and correctness checks
function create_histogram(input)
  histogram_output = zeros(Int, maximum(input))
  for i in input
    histogram_output[i] += 1
  end
  return histogram_output
end

# This is a 1D histogram kernel where the histogramming happens in shared memory (shmem)
@kernel function histogram_kernel!(histogram_output, input)
  tid = @index(Global, Linear)
  lid = @index(Local, Linear)

  @uniform warpsize = Int(32)

  @uniform gs = @groupsize()[1]
  @uniform N = length(histogram_output)

  shared_histogram = @localmem Int (gs)

  # Go through all input elements and assign each one to a location in shmem.
  # If there is not enough shmem for all bins, separate shmem blocks are used.
  # For example, if shmem holds 256 bins but values up to 312 can occur, there
  # are two blocks: one covering bins 1-256 and another covering bins 257-512.
  @uniform max_element = 1
  for min_element = 1:gs:N

    # Reset shared_histogram to 0
    @inbounds shared_histogram[lid] = 0
    @synchronize()

    max_element = min_element + gs
    if max_element > N
      max_element = N + 1
    end

    # Compute the bin relative to the current shmem block and write to it if it
    # falls inside the block
    bin = input[tid]
    if bin >= min_element && bin < max_element
      bin -= min_element - 1
      @atomic shared_histogram[bin] += 1
    end

    @synchronize()

    # Accumulate this block's shared histogram into the global histogram
    if ((lid + min_element - 1) <= N)
      @atomic histogram_output[lid + min_element - 1] += shared_histogram[lid]
    end

  end

end

function histogram!(histogram_output, input)
  backend = get_backend(histogram_output)
  # Need a static block size
  kernel! = histogram_kernel!(backend, (256,))
  kernel!(histogram_output, input, ndrange=size(input))
  return histogram_output
end

function move(backend, input)
  # TODO replace with adapt(backend, input)
  out = KernelAbstractions.allocate(backend, eltype(input), size(input))
  KernelAbstractions.copyto!(backend, out, input)
  return out
end


backend = CPU()
#backend = CUDABackend()
#backend = ROCBackend()
rand_input = [rand(1:128) for i = 1:1000000]
rand_input = move(backend, rand_input)

rand_histogram = KernelAbstractions.zeros(backend, Int, 128)
histogram!(rand_histogram, rand_input)
KernelAbstractions.synchronize(backend)
@time begin
  for i = 1:300
    histogram!(rand_histogram, rand_input)
    KernelAbstractions.synchronize(backend)
  end
end
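Since the file keeps the serial create_histogram as a baseline, a correctness check fits naturally after the timing loop. A minimal sketch, not part of the commit: it assumes the value 128 actually occurs in rand_input (so both arrays have length 128) and copies device data back to the host with Array.

# Correctness sketch (assumption, not in the commit): one fresh kernel run
# compared against the serial CPU baseline on host copies of the data.
check_histogram = KernelAbstractions.zeros(backend, Int, 128)
histogram!(check_histogram, rand_input)
KernelAbstractions.synchronize(backend)
@test Array(check_histogram) == create_histogram(Array(rand_input))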

Examples/testAdvectionCart.jl

Lines changed: 4 additions & 0 deletions
@@ -223,13 +223,17 @@ elseif Problem == "LimAdvectionCart"
 end
 
 U = GPU.InitialConditionsAdvection(backend,FTB,CG,Metric,Phys,Global,Profile,Param)
+@show maximum(abs.(U[:,:,2]))
+@show maximum(abs.(U[:,:,3]))
 
 # Output
 Global.Output.vtkFileName=string(Problem*"_")
 Global.Output.vtk=0
 Global.Output.Flat=true
 Global.Output.H=H
 Global.Output.cNames = [
+  "u",
+  "v",
   "Rho",
   "Tr1",
 ]

InfoCUDAAMD

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
CUDA
DKRZ Hamburg, A100
julia> include("Examples/TestHistogram.jl")
0.013298 seconds (13.80 k allocations: 623.438 KiB)


AMD
https://lumi-supercomputer.eu/
2978 nodes with 4 AMD MI250x GPUs and a single 64-core AMD EPYC "Trento" CPU
include("Examples/TestHistogram.jl")
0.508759 seconds (332.16 k allocations: 6.289 MiB)

CPU
julia> include("Examples/TestHistogram.jl")
3.429507 seconds (1.18 M allocations: 2.306 GiB, 18.29% gc time)

Jobs/JobAdvectionLimCart

Lines changed: 10 additions & 10 deletions
@@ -1,28 +1,28 @@
 mpirun -n 6 julia --project Examples/testAdvectionCart.jl \
   --Problem="LimAdvectionCart" \
   --Device="CPU" \
-  --FloatTypeBackend="Float32" \
+  --FloatTypeBackend="Float64" \
   --NumV=5 \
   --NumTr=1 \
   --HorLimit=true \
   --Upwind=true \
   --vtkFileName="LimAdvectionCart" \
   --SimTime=0.0 \
-  --PrintTime=0.2 \
-  --dtau=0.005 \
+  --PrintTime=0.1 \
+  --dtau=0.0025 \
   --IntMethod="SSPRungeKutta" \
   --Table="SSP32" \
   --Lx=0 \
   --Ly=0 \
   --H=0 \
   --x0=0 \
   --y0=0 \
-  --nx=10 \
-  --ny=10 \
-  --nz=10 \
+  --nx=40 \
+  --ny=40 \
+  --nz=1 \
   --OrdPoly=4 \
-  --BoundaryWE="Period" \
-  --BoundarySN="Period" \
+  --BoundaryWE="" \
+  --BoundarySN="" \
   --BoundaryBT="" \
-  --HyperVisc=false \
-  --HyperDDiv=0.e0
+  --HyperVisc=true \
+  --HyperDDiv=1.e-4

listRecv

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
"Recv", 1, 6, 37, 10
"Recv", 1, 2, 13, 12
"Recv", 5, 4, 29, 11
"Recv", 5, 3, 23, 1
"Recv", 5, 6, 41, 12
"Recv", 6, 5, 36, 12
"Recv", 6, 1, 12, 10
"Recv", 2, 1, 8, 12
"Recv", 2, 3, 20, 12
"Recv", 4, 3, 22, 11
"Recv", 4, 5, 34, 12
"Recv", 3, 2, 15, 12
"Recv", 3, 4, 27, 11
"Recv", 3, 5, 33, 1

listSend

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
"Send", 1, 6, 12, 10
"Send", 1, 2, 8, 12
"Send", 5, 4, 34, 11
"Send", 5, 3, 33, 1
"Send", 5, 6, 36, 12
"Send", 6, 5, 41, 12
"Send", 6, 1, 37, 10
"Send", 2, 1, 13, 12
"Send", 2, 3, 15, 12
"Send", 4, 3, 27, 11
"Send", 4, 5, 29, 12
"Send", 3, 2, 20, 12
"Send", 3, 4, 22, 11
"Send", 3, 5, 23, 1

src/Examples/initial.jl

Lines changed: 1 addition & 0 deletions
@@ -89,6 +89,7 @@ function (profile::LimAdvectionCartExample)(Param,Phys)
   u = -Param.u0 * x[2] * cospi(time / Param.end_time)
   v = Param.u0 * x[1] * cospi(time / Param.end_time)
   w = Param.u0 * sinpi(x[3] / Param.zmax) * cospi(time / Param.end_time)
+  w = FT(0)
 
   return (Rho,u,v,w,Tr)
 end

src/GPU/FcnGPU.jl

Lines changed: 12 additions & 7 deletions
@@ -69,17 +69,19 @@ function FcnAdvectionGPU!(F,U,time,FE,Metric,Phys,Cache,Exchange,Global,Param,Pr
   KernelAbstractions.synchronize(backend)
 
   # Hyperviscosity Part 1
-  KHyperViscTracerKernel!(CacheTr,U[:,:,1+NumV],Rho,DS,DW,dXdxI,J,M,Glob,ndrange=ndrange)
-  KernelAbstractions.synchronize(backend)
+  if ~Global.Model.HorLimit
+    KHyperViscTracerKernel!(CacheTr,U[:,:,1+NumV],Rho,DS,DW,dXdxI,J,M,Glob,ndrange=ndrange)
+    KernelAbstractions.synchronize(backend)
 
-  # Data exchange
-  Parallels.ExchangeData3DSendGPU(CacheTr,Exchange)
-  Parallels.ExchangeData3DRecvGPU!(CacheTr,Exchange)
+    # Data exchange
+    Parallels.ExchangeData3DSendGPU(CacheTr,Exchange)
+    Parallels.ExchangeData3DRecvGPU!(CacheTr,Exchange)
+  end
 
   F .= FT(0)
 
-  KDivRhoKernel!(F,U,DS,dXdxI,J,M,Glob,ndrange=ndrange)
-  KernelAbstractions.synchronize(backend)
+  #KDivRhoKernel!(F,U,DS,dXdxI,J,M,Glob,ndrange=ndrange)
+  #KernelAbstractions.synchronize(backend)
 
   if Global.Model.HorLimit
     @views KDivRhoTrUpwind3LimKernel!(F[:,:,1+NumV],U[:,:,1+NumV],U,DS,

@@ -368,6 +370,9 @@ function FcnGPUAMD!(F,U,FE,Metric,Phys,Cache,Exchange,Global,Param,DiscType)
   @views CH_I = Cache.CH[:,NBF+1:NF]
   @views MRho = CacheF[:,:,6]
   @. MRho = FT(1)
+  @show size(U)
+  @show NF
+  stop
   # Ranges
   NzG = min(div(NumberThreadGPU,N*N),Nz)
   group = (N, N, NzG, 1)

src/GPU/InitialConditions.jl

Lines changed: 2 additions & 0 deletions
@@ -118,6 +118,8 @@ function InitialConditionsAdvection(backend,FTB,CG,Metric,Phys,Global,Profile,Pa
   group = (N * N, NzG, 1)
   ndrange = (N * N, Nz, NF)
 
+  @show NF
+
   U = KernelAbstractions.zeros(backend,FTB,Nz,CG.NumG,NumV+NumTr)
   @views Rho = U[:,:,Model.RhoPos]
   @views u = U[:,:,Model.uPos]

src/Grids/SubGrid.jl

Lines changed: 0 additions & 1 deletion
@@ -143,7 +143,6 @@ function ConstructSubGrid(GlobalGrid,Proc,ProcNumber)
   Dim=3;
   Renumbering!(Edges,Faces);
   FacesInNodes!(Nodes,Faces)
-
   Form = GlobalGrid.Form
   Rad = GlobalGrid.Rad
   # Stencil

src/Outputs/vtkSphere.jl

Lines changed: 0 additions & 1 deletion
@@ -294,7 +294,6 @@ function unstructured_vtkSphere(U,Trans,CG,Metric,Cache,Global, part::Int, npart
   uCell = zeros(OrdPrint*OrdPrint*nz*NF)
   @views InterpolateGPU!(cCell,U[:,:,uPos],vtkInter,CG.Glob)
   @views copyto!(uCell,reshape(cCell,OrdPrint*OrdPrint*nz*NF))
-  # @views Interpolate!(uCell,U[:,:,uPos],vtkInter,OrdPoly,OrdPrint,CG.Glob,NF,nz)
   vtk["u", VTKCellData()] = uCell
 elseif str == "Rhou"
   uPos = Global.Model.uPos

src/Parallels/Exchange.jl

Lines changed: 21 additions & 27 deletions
@@ -80,17 +80,21 @@ function ExchangeStruct{FT}(backend,SubGrid,OrdPoly,CellToProc,Proc,ProcNumber,H
   InBoundEdgesP = zeros(Int,NumInBoundEdges)
   NumInBoundEdges = 0
   @inbounds for i = 1:SubGrid.NumEdges
-    if CellToProc[SubGrid.Edges[i].FG[1]] == Proc
-    else
-      NumInBoundEdges += 1
-      push!(InBoundEdges, i)
-      push!(InBoundEdgesP, CellToProc[SubGrid.Edges[i].FG[1]])
-    end
-    if CellToProc[SubGrid.Edges[i].FG[2]] == Proc
-    else
-      NumInBoundEdges += 1
-      push!(InBoundEdges, i)
-      push!(InBoundEdgesP, CellToProc[SubGrid.Edges[i].FG[1]])
+    if SubGrid.Edges[i].FG[1] > 0
+      if CellToProc[SubGrid.Edges[i].FG[1]] == Proc
+      else
+        NumInBoundEdges += 1
+        push!(InBoundEdges, i)
+        push!(InBoundEdgesP, CellToProc[SubGrid.Edges[i].FG[1]])
+      end
+    end
+    if SubGrid.Edges[i].FG[2] > 0
+      if CellToProc[SubGrid.Edges[i].FG[2]] == Proc
+      else
+        NumInBoundEdges += 1
+        push!(InBoundEdges, i)
+        push!(InBoundEdgesP, CellToProc[SubGrid.Edges[i].FG[1]])
+      end
     end
   end
 

@@ -633,8 +637,8 @@ function ExchangeDataFSend(cFMin,cFMax,Exchange)
   nT = size(cFMin,3)
   if Exchange.InitRecvBufferF
     @inbounds for iP in NeiProc
-      Exchange.RecvBufferF[iP] = zeros(nz,length(IndRecvBufferF[iP]),nT,2)
-      Exchange.SendBufferF[iP] = zeros(nz,length(IndRecvBufferF[iP]),nT,2)
+      Exchange.RecvBufferF[iP] = zeros(nz,length(IndRecvBufferF[iP]),2,nT)
+      Exchange.SendBufferF[iP] = zeros(nz,length(IndSendBufferF[iP]),2,nT)
     end
     RecvBufferF = Exchange.RecvBufferF
     SendBufferF = Exchange.SendBufferF

@@ -653,29 +657,20 @@ function ExchangeDataFSend(cFMin,cFMax,Exchange)
     i = 0
     @views @inbounds for Ind in IndSendBufferF[iP]
       i += 1
-      @views @. SendBufferF[iP][:,i,:,1] = cFMin[:,Ind,:]
-      @views @. SendBufferF[iP][:,i,:,2] = cFMax[:,Ind,:]
+      @views @. SendBufferF[iP][:,i,1,:] = cFMin[:,Ind,:]
+      @views @. SendBufferF[iP][:,i,2,:] = cFMax[:,Ind,:]
     end
   end
   i = 0
-  # @show size(rreq),size(sreq)
   @inbounds for iP in NeiProc
     tag = Proc + ProcNumber*iP
     i += 1
-    @show "Recv",Proc,iP,tag,size(RecvBufferF[iP])
-    if Proc == 3 && iP == 4
-      @show IndRecvBufferF[iP]
-    end
     @views MPI.Irecv!(RecvBufferF[iP], iP - 1, tag, MPI.COMM_WORLD, rreq[i])
   end
   i = 0
   @inbounds for iP in NeiProc
     tag = iP + ProcNumber*Proc
     i += 1
-    @show "Send",Proc,iP,tag,size(SendBufferF[iP])
-    if Proc == 4 && iP == 3
-      @show IndSendBufferF[iP]
-    end
     @views MPI.Isend(SendBufferF[iP], iP - 1, tag, MPI.COMM_WORLD, sreq[i])
   end
 end

@@ -694,11 +689,10 @@ function ExchangeDataFRecv!(cFMin,cFMax,Exchange)
   #Receive
   @inbounds for iP in NeiProc
     i = 0
-    # @show iP,IndRecvBufferF[iP]
     @inbounds for Ind in IndRecvBufferF[iP]
       i += 1
-      @views @. cFMin[:,Ind,:] = RecvBufferF[iP][:,i,:,1]
-      @views @. cFMax[:,Ind,:] = RecvBufferF[iP][:,i,:,2]
+      @views @. cFMin[:,Ind,:] = RecvBufferF[iP][:,i,1,:]
+      @views @. cFMax[:,Ind,:] = RecvBufferF[iP][:,i,2,:]
     end
   end
 end
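The last three hunks change the face-buffer layout from (nz, index, tracer, min/max) to (nz, index, min/max, tracer) and size the send buffer with IndSendBufferF instead of IndRecvBufferF; pack and unpack must index the buffer in exactly the same order. A minimal standalone sketch of that pairing, with hypothetical sizes and indices that are not part of the commit:

# Standalone sketch of the (nz, halo index, min/max, tracer) buffer layout.
# All sizes and the index list below are hypothetical.
nz, nT, NumG = 4, 2, 10
Ind = [3, 7, 9]                      # halo column indices for one neighbor
cFMin = rand(nz, NumG, nT)
cFMax = rand(nz, NumG, nT)

buf = zeros(nz, length(Ind), 2, nT)  # mirrors zeros(nz,length(IndSendBufferF[iP]),2,nT)
for (i, ind) in enumerate(Ind)
  @views @. buf[:, i, 1, :] = cFMin[:, ind, :]   # pack minima
  @views @. buf[:, i, 2, :] = cFMax[:, ind, :]   # pack maxima
end

# The receiving side unpacks with the same dimension order.
for (i, ind) in enumerate(Ind)
  @views @. cFMin[:, ind, :] = buf[:, i, 1, :]
  @views @. cFMax[:, ind, :] = buf[:, i, 2, :]
end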
