@@ -7,51 +7,130 @@ using KernelAbstractions
7
7
using KernelAbstractions: @atomic , @atomicswap , @atomicreplace
8
8
using KernelAbstractions. Extras
9
9
10
+ @kernel function MomentumKernel! (F,@Const (U),@Const (D),@Const (dXdxI),
11
+ @Const (MRho),@Const (M),@Const (Glob))
10
12
13
+ I, J, iz = @index (Local, NTuple)
14
+ _,_,Iz,IF = @index (Global, NTuple)
11
15
12
- @kernel function FluxUpdateKernel! (Flux,@Const (u),@Const (v),@Const (w),@Const (c))
13
-
14
- i,j,k = @index (Global, NTuple)
15
-
16
- M = @uniform @ndrange ()[1 ]
17
- N = @uniform @ndrange ()[2 ]
18
- L = @uniform @ndrange ()[3 ]
19
-
20
- if 1 < i && i < M && j > 1 && j < N && k > 1 && k < L
21
- @inbounds FluxW = eltype (Flux)(0.5 ) * (u[i- 1 ,j,k] - abs (u[i- 1 ,j,k])) * c[i,j,k]
22
- @inbounds FluxE = eltype (Flux)(0.5 ) * (u[i,j,k] + abs (u[i,j,k])) * c[i,j,k]
23
- @inbounds FluxS = eltype (Flux)(0.5 ) * (v[i,j- 1 ,k] - abs (v[i,j- 1 ,k])) * c[i,j,k]
24
- @inbounds FluxN = eltype (Flux)(0.5 ) * (v[i,j,k] + abs (v[i,j,k])) * c[i,j,k]
25
- @inbounds FluxB = eltype (Flux)(0.5 ) * (w[i,j,k- 1 ] - abs (w[i,j,k- 1 ])) * c[i,j,k]
26
- @inbounds FluxT = eltype (Flux)(0.5 ) * (w[i,j,k] + abs (w[i,j,k])) * c[i,j,k]
27
- @inbounds @atomic Flux[i- 1 ,j,k] += - FluxW
28
- @inbounds @atomic Flux[i+ 1 ,j,k] += FluxE
29
- @inbounds @atomic Flux[i,j- 1 ,k] += - FluxS
30
- @inbounds @atomic Flux[i,j+ 1 ,k] += FluxN
31
- @inbounds @atomic Flux[i,j,k- 1 ] += - FluxB
32
- @inbounds @atomic Flux[i,j,k+ 1 ] += FluxT
33
- @inbounds @atomic Flux[i,j,k] += (FluxW - FluxE + FluxS - FluxN + FluxB - FluxT)
16
+ ColumnTilesDim = @uniform @groupsize ()[3 ]
17
+ N = @uniform @groupsize ()[1 ]
18
+ Nz = @uniform @ndrange ()[3 ]
19
+ NF = @uniform @ndrange ()[4 ]
20
+
21
+ ID = I + (J - 1 ) * N
22
+ @inbounds ind = Glob[ID,IF]
23
+
24
+ RhoCol = @localmem eltype (F) (N,N,ColumnTilesDim)
25
+ uCol = @localmem eltype (F) (N,N,ColumnTilesDim)
26
+ vCol = @localmem eltype (F) (N,N,ColumnTilesDim)
27
+ wCol = @localmem eltype (F) (N,N,ColumnTilesDim+ 1 )
28
+
29
+ if Iz <= Nz
30
+ @inbounds RhoCol[I,J,iz] = U[Iz,ind,1 ]
31
+ @inbounds uCol[I,J,iz] = U[Iz,ind,2 ]
32
+ @inbounds vCol[I,J,iz] = U[Iz,ind,3 ]
33
+ @inbounds wCol[I,J,iz+ 1 ] = U[Iz,ind,4 ]
34
+ if Iz == 1
35
+ wCol[I,J,1 ] = - (dXdxI[3 ,1 ,1 ,ID,1 ,IF] * U[Iz,ind,2 ] +
36
+ dXdxI[3 ,2 ,1 ,ID,1 ,IF] * U[Iz,ind,3 ]) / dXdxI[3 ,3 ,1 ,ID,1 ,IF]
37
+ elseif iz == 1
38
+ wCol[I,J,1 ] = U[Iz- 1 ,ind,4 ]
39
+ end
34
40
end
35
- end
36
41
37
- backend = CUDABackend ()
38
- FT = Float32
39
- M = 100
40
- N = 101
41
- L = 102
42
+ @synchronize
42
43
43
- group = (9 , 9 ,9 )
44
- ndrange = (M, N , L)
44
+ ID = I + (J - 1 ) * N
45
+ @inbounds ind = Glob[ID,IF]
46
+
47
+ if Iz <= Nz
48
+ @inbounds ind = Glob[ID,IF]
49
+ @inbounds uCon1 = - RhoCol[I,J,iz] * (dXdxI[1 ,1 ,1 ,ID,Iz,IF] * uCol[I,J,iz] +
50
+ dXdxI[1 ,2 ,1 ,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[1 ,3 ,1 ,ID,Iz,IF] * wCol[I,J,iz])
51
+ @inbounds uCon2 = - RhoCol[I,J,iz] * (dXdxI[1 ,1 ,2 ,ID,Iz,IF] * uCol[I,J,iz] +
52
+ dXdxI[1 ,2 ,2 ,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[1 ,3 ,2 ,ID,Iz,IF] * wCol[I,J,iz+ 1 ])
53
+ @inbounds vCon1 = - RhoCol[I,J,iz] * (dXdxI[2 ,1 ,1 ,ID,Iz,IF] * uCol[I,J,iz] +
54
+ dXdxI[2 ,2 ,1 ,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[2 ,3 ,1 ,ID,Iz,IF] * wCol[I,J,iz])
55
+ @inbounds vCon2 = - RhoCol[I,J,iz] * (dXdxI[2 ,1 ,2 ,ID,Iz,IF] * uCol[I,J,iz] +
56
+ dXdxI[2 ,2 ,2 ,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[2 ,3 ,2 ,ID,Iz,IF] * wCol[I,J,iz+ 1 ])
57
+ @inbounds wCon1 = - RhoCol[I,J,iz] * (dXdxI[3 ,1 ,1 ,ID,Iz,IF] * uCol[I,J,iz] +
58
+ dXdxI[3 ,2 ,1 ,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[3 ,3 ,1 ,ID,Iz,IF] * wCol[I,J,iz])
59
+ @inbounds wCon2 = - RhoCol[I,J,iz] * (dXdxI[3 ,1 ,2 ,ID,Iz,IF] * uCol[I,J,iz] +
60
+ dXdxI[3 ,2 ,2 ,ID,Iz,IF] * vCol[I,J,iz] + dXdxI[3 ,3 ,2 ,ID,Iz,IF] * wCol[I,J,iz+ 1 ])
61
+
62
+ @inbounds Dxu = D[I,1 ] * uCol[1 ,J,iz]
63
+ @inbounds Dyu = D[J,1 ] * uCol[I,1 ,iz]
64
+ @inbounds Dxv = D[I,1 ] * vCol[1 ,J,iz]
65
+ @inbounds Dyv = D[J,1 ] * vCol[I,1 ,iz]
66
+ @inbounds Dxw1 = D[I,1 ] * wCol[1 ,J,iz]
67
+ @inbounds Dyw1 = D[J,1 ] * wCol[I,1 ,iz]
68
+ @inbounds Dxw2 = D[I,1 ] * wCol[1 ,J,iz+ 1 ]
69
+ @inbounds Dyw2 = D[J,1 ] * wCol[I,1 ,iz+ 1 ]
70
+ Izp = min (Iz+ 1 ,Nz)
71
+ Izm = max (Iz- 1 ,1 )
72
+ ind = Glob[ID,IF]
73
+ Dzu2 = eltype (F)(0.5 ) * (U[Izp,ind,2 ] - uCol[I,J,iz])
74
+ Dzv2 = eltype (F)(0.5 ) * (U[Izp,ind,3 ] - vCol[I,J,iz])
75
+ Dzu1 = eltype (F)(0.5 ) * (uCol[I,J,iz] - U[Izm,ind,2 ])
76
+ Dzv1 = eltype (F)(0.5 ) * (vCol[I,J,iz] - U[Izm,ind,3 ])
77
+ Dzw = eltype (F)(0.5 ) * (wCol[I,J,iz+ 1 ] - wCol[I,J,iz])
78
+ for k = 2 : N
79
+ @inbounds Dxu += D[I,k] * uCol[k,J,iz]
80
+ @inbounds Dyu += D[J,k] * uCol[I,k,iz]
81
+ @inbounds Dxv += D[I,k] * vCol[k,J,iz]
82
+ @inbounds Dyv += D[J,k] * vCol[I,k,iz]
83
+ @inbounds Dxw1 += D[I,k] * wCol[k,J,iz]
84
+ @inbounds Dyw1 += D[J,k] * wCol[I,k,iz]
85
+ @inbounds Dxw2 += D[I,k] * wCol[k,J,iz+ 1 ]
86
+ @inbounds Dyw2 += D[J,k] * wCol[I,k,iz+ 1 ]
87
+ end
88
+
89
+ @inbounds @atomic F[Iz,ind,2 ] += ((uCon1 + uCon2) * Dxu + (vCon1 + vCon2) * Dyu +
90
+ wCon1 * Dzu1 + wCon2 * Dzu2) / M[Iz,ind] / RhoCol[I,J,iz]
91
+ @inbounds @atomic F[Iz,ind,3 ] += ((uCon1 + uCon2) * Dxv + (vCon1 + vCon2) * Dyv +
92
+ wCon1 * Dzv1 + wCon2 * Dzv2) / M[Iz,ind] / RhoCol[I,J,iz]
93
+ end
94
+ if Iz > 1
95
+ @inbounds @atomic F[Iz- 1 ,ind,4 ] += (uCon1 * Dxw1 + vCon1 * Dyw1 + wCon1 * Dzw) / MRho[Iz- 1 ,ind]
96
+ end
97
+ if Iz < Nz
98
+ @inbounds @atomic F[Iz,ind,4 ] += (uCon2 * Dxw2 + vCon2 * Dyw2 + wCon2 * Dzw) / MRho[Iz,ind]
99
+ end
100
+ end
101
+
102
+ NF = 5400 # 30*30*6
103
+ NumG = 48602
104
+ Ord = 4
105
+ DoF = Ord * Ord
106
+ NumV = 5
107
+ NumTr = 2
108
+ Nz = 64
109
+ NumberThreadGPU = 1024
110
+ GlobCPU = zeros (Int,DoF,NF)
111
+ read! (" GlobInd" ,GlobCPU)
112
+
113
+
114
+
115
+ backend = CPU ()
116
+ # backend = CUDABackend()
117
+ FT = Float32
118
+ NzG = min (div (NumberThreadGPU,DoF),Nz)
119
+ group = (Ord, Ord, NzG, 1 )
120
+ ndrange = (Ord, Ord, Nz, NF)
45
121
46
- u = KernelAbstractions. ones (backend,FT,M+ 1 ,N,L)
47
- v = KernelAbstractions. ones (backend,FT,M,N+ 1 ,L)
48
- w = KernelAbstractions. ones (backend,FT,M,N,L+ 1 )
49
- c = KernelAbstractions. ones (backend,FT,M,N,L)
50
- Flux = KernelAbstractions. zeros (backend,FT,M,N,L)
122
+ F = KernelAbstractions. ones (backend,FT,Nz,NumG,NumV + NumTr)
123
+ U = KernelAbstractions. ones (backend,FT,Nz,NumG,NumV + NumTr)
124
+ D = KernelAbstractions. ones (backend,FT,4 ,4 )
125
+ dXdxI = KernelAbstractions. ones (backend,FT,3 ,3 ,2 ,DoF,Nz,NF)
126
+ MRho = KernelAbstractions. ones (backend,FT,Nz- 1 ,NumG)
127
+ M = KernelAbstractions. ones (backend,FT,Nz,NumG)
128
+ Glob = KernelAbstractions. zeros (backend,Int,DoF,NF)
129
+ copyto! (Glob,GlobCPU)
51
130
52
- KFluxUpdateKernel ! = FluxUpdateKernel ! (backend,group)
131
+ KMomentumKernel ! = MomentumKernel ! (backend,group)
53
132
54
- KFluxUpdateKernel! (Flux,u,v,w,c ,ndrange= ndrange)
133
+ KMomentumKernel! (F,U,D,dXdxI,MRho,M,Glob ,ndrange= ndrange)
55
134
@time for iter = 1 : 100
56
- KFluxUpdateKernel! (Flux,u,v,w,c ,ndrange= ndrange)
135
+ KMomentumKernel! (F,U,D,dXdxI,MRho,M,Glob ,ndrange= ndrange)
57
136
end
0 commit comments