diff --git a/Jobs/JobAdvectionLimCart b/Jobs/JobAdvectionLimCart index be9b508..d2d2c24 100755 --- a/Jobs/JobAdvectionLimCart +++ b/Jobs/JobAdvectionLimCart @@ -1,4 +1,4 @@ -mpirun -n 6 julia --project Examples/testAdvectionCart.jl \ +mpirun -n 2 julia --project Examples/testAdvectionCart.jl \ --Problem="LimAdvectionCart" \ --Device="CPU" \ --FloatTypeBackend="Float64" \ diff --git a/Jobs/JobAdvectionLimCart4 b/Jobs/JobAdvectionLimCart4 new file mode 100755 index 0000000..6df9548 --- /dev/null +++ b/Jobs/JobAdvectionLimCart4 @@ -0,0 +1,31 @@ +export UCX_ERROR_SIGNALS="" +srun -n 4 gpu_wrapper.sh -n 4 -e "julia --project Examples/testAdvectionCart.jl \ + --Problem="LimAdvectionCart" \ + --Device="GPU" \ + --GPUType="CUDA" \ + --NumberThreadGPU=1024 \ + --FloatTypeBackend="Float64" \ + --NumV=5 \ + --NumTr=1 \ + --HorLimit=true \ + --Upwind=true \ + --vtkFileName="LimAdvectionCart" \ + --SimTime=0.0 \ + --PrintTime=0.1 \ + --dtau=0.0025 \ + --IntMethod="SSPRungeKutta" \ + --Table="SSP32" \ + --Lx=0 \ + --Ly=0 \ + --H=0 \ + --x0=0 \ + --y0=0 \ + --nx=40 \ + --ny=40 \ + --nz=40 \ + --OrdPoly=4 \ + --BoundaryWE="Period" \ + --BoundarySN="Period" \ + --BoundaryBT="" \ + --HyperVisc=true \ + --HyperDDiv=1.e-4" diff --git a/Jobs/JobNHBaroWaveDrySphere4 b/Jobs/JobNHBaroWaveDrySphere4 new file mode 100755 index 0000000..36a01e7 --- /dev/null +++ b/Jobs/JobNHBaroWaveDrySphere4 @@ -0,0 +1,44 @@ +export UCX_ERROR_SIGNALS="" +srun -n 4 gpu_wrapper.sh -n 4 -e "julia --project Examples/testNHSphere.jl \ + --Problem="BaroWaveDrySphere" \ + --Device="GPU" \ + --GPUType="CUDA" \ + --FloatTypeBackend="Float32" \ + --NumberThreadGPU=1024 \ + --NumV=5 \ + --NumTr=0 \ + --ProfpBGrd="" \ + --ProfRhoBGrd="" \ + --Source=false \ + --Forcing=false \ + --Curl=false \ + --ModelType="VectorInvariant" \ + --Coriolis=true \ + --VerticalDiffusion=false \ + --Upwind=true \ + --HorLimit=false \ + --Buoyancy=true \ + --Decomp="EqualArea" \ + --SimDays=10 \ + --SimSeconds=0 \ + --PrintSeconds=0 \ + 
--PrintMinutes=0 \ + --PrintHours=0 \ + --PrintDays=0 \ + --StartAverageDays=100 \ + --Flat=true \ + --dtau=150 \ + --IntMethod="Rosenbrock" \ + --Table="SSP-Knoth" \ + --TopoS="" \ + --Stretch=true \ + --StretchType="Exp" \ + --GridType="CubedSphere" \ + --nz=64 \ + --nPanel=30 \ + --H=30000.0 \ + --OrdPoly=3 \ + --HyperVisc=true \ + --HyperDCurl=5.e14 \ + --HyperDGrad=5.e14 \ + --HyperDDiv=5.e14" diff --git a/src/Examples/parameters.jl b/src/Examples/parameters.jl index 9a8dcc6..7197078 100644 --- a/src/Examples/parameters.jl +++ b/src/Examples/parameters.jl @@ -244,28 +244,26 @@ Base.@kwdef struct LimAdvectionCart{FT} u0::FT = π / 2 # angular velocity r0::FT = (xmax - xmin) / 6 # bells radius end_time::FT = 2π # simulation period in seconds - centers1xC = xmin + (xmax - xmin) / 4 - centers1yC = ymin + (ymax - ymin) / 2 - centers1zC = zmin + (zmax - zmin) / 2 - centers2xC = xmin + 3 * (xmax - xmin) / 4 - centers2yC = ymin + (ymax - ymin) / 2 - centers2zC = zmin + (zmax - zmin) / 2 + centers1xC::FT = xmin + (xmax - xmin) / 4 + centers1yC::FT = ymin + (ymax - ymin) / 2 + centers1zC::FT = zmin + (zmax - zmin) / 2 + centers2xC::FT = xmin + 3 * (xmax - xmin) / 4 + centers2yC::FT = ymin + (ymax - ymin) / 2 + centers2zC::FT = zmin + (zmax - zmin) / 2 end -Base.@kwdef struct ParamAdvectionCubeRotCart{FT} +Base.@kwdef struct ParamAdvectionCubeRotCart StreamFun::Bool = false - uMax::FT = 1.0 - vMax::FT = 1.0 - xC::FT = 500.0 - zC::FT = 500.0 - x1::FT = 299.0 - x2::FT = 501.0 - y1::FT = 299.0 - y2::FT = 501.0 - z1::FT = 299.0 - z2::FT = 501.0 - EndTime::FT = 1000.0 - H::FT = 1000.0 + uMax::Float64 = 1.0 + vMax::Float64 = 0.0 + xC::Float64 = 500.0 + zC::Float64 = 500.0 + x1::Float64 = 299.0 + x2::Float64 = 501.0 + z1::Float64 = 299.0 + z2::Float64 = 501.0 + EndTime::Float64 = 1000.0 + H::Float64 = 1000.0 end Base.@kwdef struct ParamAdvectionCart @@ -315,10 +313,10 @@ function Parameters(FT,Problem::String) Param = ParamAdvectionCubeCart() elseif Problem == 
"AdvectionCubeRotCart" @show Problem - Param = ParamAdvectionCubeRotCart{FT}() + Param = ParamAdvectionCubeRotCart() elseif Problem == "LimAdvectionCart" @show Problem - Param = LimAdvectionCart{FT}() + Param = LimAdvectionCart{FT}() elseif Problem == "WarmBubble2DXCart" @show Problem Param = ParamWarmBubble2DXCart() diff --git a/src/GPU/FcnGPU.jl b/src/GPU/FcnGPU.jl index 1ff33e5..a5164a4 100644 --- a/src/GPU/FcnGPU.jl +++ b/src/GPU/FcnGPU.jl @@ -59,8 +59,8 @@ function FcnAdvectionGPU!(F,U,time,FE,Metric,Phys,Cache,Exchange,Global,Param,Pr if Global.Model.HorLimit @views KLimitKernel!(DoF,qMin,qMax,U[:,:,NumV+1:NumV+NumTr],Rho,Glob,ndrange=ndrangeL) KernelAbstractions.synchronize(backend) - Parallels.ExchangeDataFSend(qMin,qMax,Exchange) - Parallels.ExchangeDataFRecv!(qMin,qMax,Exchange) + Parallels.ExchangeDataFSendGPU(qMin,qMax,Exchange) + Parallels.ExchangeDataFRecvGPU!(qMin,qMax,Exchange) end diff --git a/src/Parallels/Exchange.jl b/src/Parallels/Exchange.jl index 4121d12..57aa03a 100644 --- a/src/Parallels/Exchange.jl +++ b/src/Parallels/Exchange.jl @@ -1,10 +1,11 @@ mutable struct ExchangeStruct{FT<:AbstractFloat, IT1<:AbstractArray, - AT3<:AbstractArray} + AT3<:AbstractArray, + AT4<:AbstractArray} IndSendBuffer::Dict{Int,IT1} - IndSendBufferF::Dict{Int,Array{Int,1}} + IndSendBufferF::Dict{Int,IT1} IndRecvBuffer::Dict{Int,IT1} - IndRecvBufferF::Dict{Int,Array{Int,1}} + IndRecvBufferF::Dict{Int,IT1} NeiProc::Array{Int, 1} Proc::Int ProcNumber::Int @@ -13,13 +14,13 @@ mutable struct ExchangeStruct{FT<:AbstractFloat, SendBuffer::Dict #SendBuffer3::Dict{Int,Array{FT, 3}} SendBuffer3::Dict{Int,AT3} - SendBufferF::Dict{Int,Array{FT, 4}} + SendBufferF::Dict{Int,AT4} InitRecvBuffer::Bool InitRecvBufferF::Bool RecvBuffer::Dict # RecvBuffer3::Dict{Int,Array{FT, 3}} RecvBuffer3::Dict{Int,AT3} - RecvBufferF::Dict{Int,Array{FT, 4}} + RecvBufferF::Dict{Int,AT4} sreq::MPI.UnsafeMultiRequest rreq::MPI.UnsafeMultiRequest end @@ -45,10 +46,12 @@ function 
ExchangeStruct{FT}(backend) where FT<:AbstractFloat
   sreq = MPI.UnsafeMultiRequest(0)
   rreq = MPI.UnsafeMultiRequest(0)
   AT3 = KernelAbstractions.zeros(backend,FT,0,0,0)
+  AT4 = KernelAbstractions.zeros(backend,FT,0,0,0,0)
   IT1 = KernelAbstractions.zeros(backend,Int,0)
   return ExchangeStruct{FT,
                         typeof(IT1),
-                        typeof(AT3)}(
+                        typeof(AT3),
+                        typeof(AT4)}(
     IndSendBuffer,
     IndSendBufferF,
     IndRecvBuffer,
@@ -358,6 +361,7 @@ function ExchangeStruct{FT}(backend,SubGrid,OrdPoly,CellToProc,Proc,ProcNumber,H
 
   # Copy from CPU to device
   AT3 = KernelAbstractions.zeros(backend,FT,0,0,0)
+  AT4 = KernelAbstractions.zeros(backend,FT,0,0,0,0)
   IT1 = KernelAbstractions.zeros(backend,Int,0)
 
   SendBuffer = Dict()
@@ -374,7 +378,8 @@ function ExchangeStruct{FT}(backend,SubGrid,OrdPoly,CellToProc,Proc,ProcNumber,H
 
   return ExchangeStruct{FT,
                         typeof(IT1),
-                        typeof(AT3)}(
+                        typeof(AT3),
+                        typeof(AT4)}(
     SendBuffer,
     IndSendBufferF,
     RecvBuffer,
@@ -675,6 +680,79 @@ function ExchangeDataFSend(cFMin,cFMax,Exchange)
   end
 end
 
+# Pack the limiter bounds `cFMin`/`cFMax` of the nodes listed in
+# `IndSendBufferF` for each process in `NeiProc` into the send buffers (device
+# kernel) and post the non-blocking MPI receives and sends. `cFMin`/`cFMax`
+# are indexed as (level, node, tracer); the buffers as (level, exchanged node,
+# 1=min/2=max, tracer). The requests are completed in `ExchangeDataFRecvGPU!`.
+function ExchangeDataFSendGPU(cFMin,cFMax,Exchange)
+  backend = get_backend(cFMin)
+  FT = eltype(cFMin)
+  IndSendBufferF = Exchange.IndSendBufferF
+  IndRecvBufferF = Exchange.IndRecvBufferF
+  NeiProc = Exchange.NeiProc
+  Proc = Exchange.Proc
+  ProcNumber = Exchange.ProcNumber
+  nz = size(cFMin,1)
+  nT = size(cFMin,3)
+  if Exchange.InitRecvBufferF
+    # First call: allocate the buffers on the backend of `cFMin` with its
+    # element type instead of as host Float64 arrays.
+    @inbounds for iP in NeiProc
+      Exchange.RecvBufferF[iP] =
+        KernelAbstractions.zeros(backend,FT,nz,length(IndRecvBufferF[iP]),2,nT)
+      Exchange.SendBufferF[iP] =
+        KernelAbstractions.zeros(backend,FT,nz,length(IndSendBufferF[iP]),2,nT)
+    end
+    Exchange.InitRecvBufferF = false
+    Exchange.InitSendBufferF = false
+  end
+  RecvBufferF = Exchange.RecvBufferF
+  SendBufferF = Exchange.SendBufferF
+  rreq = Exchange.rreq
+  sreq = Exchange.sreq
+
+  group = (nz,5,1)
+  KExchangeDataFSendKernel! = ExchangeDataFSendKernel!(backend,group)
+  @inbounds for iP in NeiProc
+    ndrange = (nz,length(IndSendBufferF[iP]),nT)
+    KExchangeDataFSendKernel!(cFMin,cFMax,SendBufferF[iP],IndSendBufferF[iP],ndrange=ndrange)
+  end
+  # Kernel launches are asynchronous and MPI is not ordered with the device
+  # queue: wait until the send buffers are filled (and earlier kernels are
+  # done with the receive buffers) before handing the buffers to MPI.
+  KernelAbstractions.synchronize(backend)
+  i = 0
+  @inbounds for iP in NeiProc
+    tag = Proc + ProcNumber*iP
+    i += 1
+    @views MPI.Irecv!(RecvBufferF[iP], iP - 1, tag, MPI.COMM_WORLD, rreq[i])
+  end
+  i = 0
+  @inbounds for iP in NeiProc
+    tag = iP + ProcNumber*Proc
+    i += 1
+    @views MPI.Isend(SendBufferF[iP], iP - 1, tag, MPI.COMM_WORLD, sreq[i])
+  end
+end
+
+# Gather kernel: copy the nodes listed in `IndSendBufferF` from `cFMin` and
+# `cFMax` into slots 1 and 2 of `SendBufferF`. Launched over
+# (level, exchanged node, tracer).
+@kernel function ExchangeDataFSendKernel!(cFMin,cFMax,SendBufferF,IndSendBufferF)
+
+  Iz,I,IT = @index(Global, NTuple)
+  NumInd = @uniform @ndrange()[2]
+  NT = @uniform @ndrange()[3]
+
+  # Skip work-items outside the launch range.
+  if I <= NumInd && IT <= NT
+    @inbounds Ind = IndSendBufferF[I]
+    @inbounds SendBufferF[Iz,I,1,IT] = cFMin[Iz,Ind,IT]
+    @inbounds SendBufferF[Iz,I,2,IT] = cFMax[Iz,Ind,IT]
+  end
+end
+
 function ExchangeDataFRecv!(cFMin,cFMax,Exchange)
 
   IndRecvBufferF = Exchange.IndRecvBufferF
@@ -697,6 +775,51 @@ function ExchangeDataFRecv!(cFMin,cFMax,Exchange)
   end
 end
 
+# Complete the MPI requests posted by `ExchangeDataFSendGPU` and scatter the
+# received limiter bounds into `cFMin`/`cFMax` with a device kernel.
+function ExchangeDataFRecvGPU!(cFMin,cFMax,Exchange)
+  backend = get_backend(cFMin)
+
+  IndRecvBufferF = Exchange.IndRecvBufferF
+  NeiProc = Exchange.NeiProc
+  RecvBufferF = Exchange.RecvBufferF
+  rreq = Exchange.rreq
+  sreq = Exchange.sreq
+
+  MPI.Waitall(rreq)
+  MPI.Waitall(sreq)
+  MPI.Barrier(MPI.COMM_WORLD)
+  Nz = size(cFMin,1)
+  # Tracers are the third dimension of `cFMin` (see the kernel indexing and
+  # `ExchangeDataFSendGPU`); `size(cFMin,4)` is always 1 for a 3D array and
+  # would unpack only the first tracer.
+  nT = size(cFMin,3)
+  group = (Nz,5,1)
+  KExchangeDataFRecvKernel! = ExchangeDataFRecvKernel!(backend,group)
+
+  #Receive
+  @inbounds for iP in NeiProc
+    ndrange = (Nz,length(IndRecvBufferF[iP]),nT)
+    KExchangeDataFRecvKernel!(cFMin,cFMax,RecvBufferF[iP],IndRecvBufferF[iP],ndrange=ndrange)
+  end
+end
+
+# Scatter kernel: copy slots 1 and 2 of `RecvBufferF` into `cFMin` and
+# `cFMax` at the nodes listed in `IndRecvBufferF`. Launched over
+# (level, exchanged node, tracer).
+@kernel function ExchangeDataFRecvKernel!(cFMin,cFMax,RecvBufferF,IndRecvBufferF)
+
+  Iz,I,IT = @index(Global, NTuple)
+  NumInd = @uniform @ndrange()[2]
+  NT = @uniform @ndrange()[3]
+  # Skip work-items outside the launch range.
+  if I <= NumInd && IT <= NT
+    @inbounds Ind = IndRecvBufferF[I]
+    @inbounds cFMin[Iz,Ind,IT] = RecvBufferF[Iz,I,1,IT]
+    @inbounds cFMax[Iz,Ind,IT] = RecvBufferF[Iz,I,2,IT]
+  end
+end
+
 function InitExchangeData3D(backend,FT,nz,nT,Exchange)
   IndSendBuffer = Exchange.IndSendBuffer
   IndRecvBuffer = Exchange.IndRecvBuffer