From 5c88f269fb20cf514970b5b1ed91169e67c0368e Mon Sep 17 00:00:00 2001 From: Zhen Wu Date: Mon, 6 May 2024 16:00:15 -0400 Subject: [PATCH] use atomci in KA --- src/Diagnostics/diagnostics_kernels.jl | 20 ++--- src/Plankton/CarbonMode/consume_loss.jl | 50 ++++-------- .../MacroMolecularMode/consume_loss.jl | 78 ++++++------------- src/Plankton/QuotaMode/consume_loss.jl | 78 ++++++------------- src/Plankton/utils.jl | 24 ++---- 5 files changed, 76 insertions(+), 174 deletions(-) diff --git a/src/Diagnostics/diagnostics_kernels.jl b/src/Diagnostics/diagnostics_kernels.jl index dd36ebca..75cf9cc1 100644 --- a/src/Diagnostics/diagnostics_kernels.jl +++ b/src/Diagnostics/diagnostics_kernels.jl @@ -1,23 +1,15 @@ ##### ##### record diagnostics of plankton processes at each time step ##### - -function gpu_diags_proc_kernel!(diags_proc, proc, ac, x, y, z) - index = (blockIdx().x - 1) * blockDim().x + threadIdx().x - stride = blockDim().x * gridDim().x - for i = index:stride:size(ac,1) - @inbounds CUDA.@atomic diags_proc[x[i], y[i], z[i]] += proc[i] * ac[i] - end +function diags_proc_kernel!(diags_proc, proc, ac, x, y, z) + i = @index(Global) + @inbounds KernelAbstractions.@atomic diags_proc[x[i], y[i], z[i]] += proc[i] * ac[i] end -function diags_proc!(diags_proc, proc, ac, x, y, z, ::GPU) - @cuda threads=256 blocks=ceil(Int, size(ac,1)/256) gpu_diags_proc_kernel!(diags_proc, proc, ac, x, y, z) +function diags_proc!(diags_proc, proc, ac, x, y, z, arch) + kernel! = diags_proc_kernel!(device(arch), 256, (size(ac,1))) + kernel!(diags_proc, proc, ac, x, y, z) return nothing end -function diags_proc!(diags_proc, proc, ac, x, y, z, ::CPU) - for i in 1:size(ac,1) - @inbounds diags_proc[x[i], y[i], z[i]] += proc[i] * ac[i] - end -end function diags_spcs!(diags_sp, plank, ac, x, y, z, mode::AbstractMode, arch::Architecture) diags = (:PS, :resp, :Bm, :Chl, :Th) diff --git a/src/Plankton/CarbonMode/consume_loss.jl b/src/Plankton/CarbonMode/consume_loss.jl index 902f963a..c5962f2f 100644 --- a/src/Plankton/CarbonMode/consume_loss.jl +++ b/src/Plankton/CarbonMode/consume_loss.jl @@ -1,42 +1,22 @@ ##### deal with nutrients uptake -function gpu_calc_consume_kernel!(ctsdic, plank, ac, x, y, z, ΔT) - index = (blockIdx().x - 1) * blockDim().x + threadIdx().x - stride = blockDim().x * gridDim().x - for i = index:stride:size(ac,1) - @inbounds CUDA.@atomic ctsdic[x[i], y[i], z[i]] += (plank.RS[i] - plank.PS[i]) * ΔT * ac[i] - end - return nothing +function calc_consume_kernel!(ctsdic, plank, ac, x, y, z, ΔT) + i = @index(Global) + @inbounds KernelAbstractions.@atomic ctsdic[x[i], y[i], z[i]] += (plank.RS[i] - plank.PS[i]) * ΔT * ac[i] end -function calc_consume!(ctsdic, plank, ac, x, y, z, ΔT, ::GPU) - @cuda threads=256 blocks=ceil(Int, size(ac,1)/256) gpu_calc_consume_kernel!(ctsdic, plank, ac, x, y, z, ΔT) +function calc_consume!(ctsdic, plank, ac, x, y, z, ΔT, arch) + kernel! = calc_consume_kernel!(device(arch), 256, (size(ac,1))) + kernel!(ctsdic, plank, ac, x, y, z, ΔT) return nothing end -function calc_consume!(ctsdic, plank, ac, x, y, z, ΔT, ::CPU) - for i in 1:size(ac,1) - @inbounds ctsdic[x[i], y[i], z[i]] += (plank.RS[i] - plank.PS[i]) * ΔT * ac[i] - end - return nothing -end ##### deal with grazed or dead individuals -function gpu_calc_loss_kernel!(ctsdoc, ctspoc, plank, ac, x, y, z, loss, lossFracC) - index = (blockIdx().x - 1) * blockDim().x + threadIdx().x - stride = blockDim().x * gridDim().x - for i = index:stride:size(ac,1) - @inbounds CUDA.@atomic ctsdoc[x[i], y[i], z[i]] += plank.Bm[i] * lossFracC * ac[i] * loss[i] - @inbounds CUDA.@atomic ctspoc[x[i], y[i], z[i]] += plank.Bm[i] * (1.0-lossFracC) * ac[i] * loss[i] - end - return nothing -end -function calc_loss!(ctsdoc, ctspoc, plank, ac, x, y, z, - loss, lossFracC, ::GPU) - @cuda threads=256 blocks=ceil(Int, size(ac,1)/256) gpu_calc_loss_kernel!(ctsdoc, ctspoc, plank, ac, x, y, z, loss, lossFracC) - return nothing -end -function calc_loss!(ctsdoc, ctspoc, plank, ac, x, y, z, loss, lossFracC, ::CPU) - for i in 1:size(ac,1) - @inbounds ctsdoc[x[i], y[i], z[i]] += plank.Bm[i] * lossFracC * ac[i] * loss[i] - @inbounds ctspoc[x[i], y[i], z[i]] += plank.Bm[i] * (1.0-lossFracC) * ac[i] * loss[i] - end - return nothing +function calc_loss_kernel!(ctsdoc, ctspoc, plank, ac, x, y, z, loss, lossFracC) + i = @index(Global) + @inbounds KernelAbstractions.@atomic ctsdoc[x[i], y[i], z[i]] += plank.Bm[i] * lossFracC * ac[i] * loss[i] + @inbounds KernelAbstractions.@atomic ctspoc[x[i], y[i], z[i]] += plank.Bm[i] * (1.0-lossFracC) * ac[i] * loss[i] end +function calc_loss!(ctsdoc, ctspoc, plank, ac, x, y, z, loss, lossFracC, arch) + kernel! = calc_loss_kernel!(device(arch), 256, (size(ac,1))) + kernel!(ctsdoc, ctspoc, plank, ac, x, y, z, loss, lossFracC) +return nothing +end \ No newline at end of file diff --git a/src/Plankton/MacroMolecularMode/consume_loss.jl b/src/Plankton/MacroMolecularMode/consume_loss.jl index f83c7aad..9f589fdc 100644 --- a/src/Plankton/MacroMolecularMode/consume_loss.jl +++ b/src/Plankton/MacroMolecularMode/consume_loss.jl @@ -1,63 +1,33 @@ ##### deal with nutrients uptake -function gpu_calc_consume_kernel!(ctsdic, ctsdoc, ctsnh4, ctsno3, ctspo4, plank, ac, x, y, z, ΔT) - index = (blockIdx().x - 1) * blockDim().x + threadIdx().x - stride = blockDim().x * gridDim().x - for i = index:stride:size(ac,1) - @inbounds CUDA.@atomic ctsdic[x[i], y[i], z[i]] += (plank.resp[i] - plank.PS[i]) * ΔT * ac[i] - @inbounds CUDA.@atomic ctsdoc[x[i], y[i], z[i]] += (plank.exu[i] - plank.VDOC[i]) * ΔT * ac[i] - @inbounds CUDA.@atomic ctsnh4[x[i], y[i], z[i]] += -plank.VNH4[i] * ΔT * ac[i] - @inbounds CUDA.@atomic ctsno3[x[i], y[i], z[i]] += -plank.VNO3[i] * ΔT * ac[i] - @inbounds CUDA.@atomic ctspo4[x[i], y[i], z[i]] += -plank.VPO4[i] * ΔT * ac[i] - end - return nothing +function calc_consume_kernel!(ctsdic, ctsdoc, ctsnh4, ctsno3, ctspo4, plank, ac, x, y, z, ΔT) + i = @index(Global) + @inbounds KernelAbstractions.@atomic ctsdic[x[i], y[i], z[i]] += (plank.resp[i] - plank.PS[i]) * ΔT * ac[i] + @inbounds KernelAbstractions.@atomic ctsdoc[x[i], y[i], z[i]] += (plank.exu[i] - plank.VDOC[i]) * ΔT * ac[i] + @inbounds KernelAbstractions.@atomic ctsnh4[x[i], y[i], z[i]] += -plank.VNH4[i] * ΔT * ac[i] + @inbounds KernelAbstractions.@atomic ctsno3[x[i], y[i], z[i]] += -plank.VNO3[i] * ΔT * ac[i] + @inbounds KernelAbstractions.@atomic ctspo4[x[i], y[i], z[i]] += -plank.VPO4[i] * ΔT * ac[i] end -function calc_consume!(ctsdic, ctsdoc, ctsnh4, ctsno3, ctspo4, plank, ac, x, y, z, ΔT, ::GPU) - @cuda threads=256 blocks=ceil(Int, size(ac,1)/256) gpu_calc_consume_kernel!(ctsdic, ctsdoc, ctsnh4, - ctsno3, ctspo4, plank, ac, x, y, z, ΔT) +function calc_consume!(ctsdic, ctsdoc, ctsnh4, ctsno3, ctspo4, plank, ac, x, y, z, ΔT, arch) + kernel! = calc_consume_kernel!(device(arch), 256, (size(ac,1))) + kernel!(ctsdic, ctsdoc, ctsnh4, ctsno3, ctspo4, plank, ac, x, y, z, ΔT) return nothing end -function calc_consume!(ctsdic, ctsdoc, ctsnh4, ctsno3, ctspo4, plank, ac, x, y, z, ΔT, ::CPU) - for i in 1:size(ac,1) - @inbounds ctsdic[x[i], y[i], z[i]] += (plank.resp[i] - plank.PS[i]) * ΔT * ac[i] - @inbounds ctsdoc[x[i], y[i], z[i]] += (plank.exu[i] - plank.VDOC[i]) * ΔT * ac[i] - @inbounds ctsnh4[x[i], y[i], z[i]] += -plank.VNH4[i] * ΔT * ac[i] - @inbounds ctsno3[x[i], y[i], z[i]] += -plank.VNO3[i] * ΔT * ac[i] - @inbounds ctspo4[x[i], y[i], z[i]] += -plank.VPO4[i] * ΔT * ac[i] - end - return nothing -end ##### deal with grazed or dead individuals -function gpu_calc_loss_kernel!(ctsdoc, ctspoc, ctsdon, ctspon, ctsdop, ctspop, plank, ac, x, y, z, - loss, lossFracC, lossFracN, lossFracP, p) - index = (blockIdx().x - 1) * blockDim().x + threadIdx().x - stride = blockDim().x * gridDim().x - for i = index:stride:size(ac,1) - @inbounds CUDA.@atomic ctsdoc[x[i], y[i], z[i]] += total_C_biomass(plank.PRO[i], plank.DNA[i], plank.RNA[i], plank.CH[i], plank.Chl[i]) * lossFracC * ac[i] * loss[i] - @inbounds CUDA.@atomic ctsdon[x[i], y[i], z[i]] += total_N_biomass(plank.PRO[i], plank.DNA[i], plank.RNA[i], plank.NST[i], plank.Chl[i], p) * lossFracN * ac[i] * loss[i] - @inbounds CUDA.@atomic ctsdop[x[i], y[i], z[i]] += total_P_biomass(plank.DNA[i], plank.RNA[i], plank.PST[i], p) * lossFracP * ac[i] * loss[i] - @inbounds CUDA.@atomic ctspoc[x[i], y[i], z[i]] += total_C_biomass(plank.PRO[i], plank.DNA[i], plank.RNA[i], plank.CH[i], plank.Chl[i]) * (1.0-lossFracC) * ac[i] * loss[i] - @inbounds CUDA.@atomic ctspon[x[i], y[i], z[i]] += total_N_biomass(plank.PRO[i], plank.DNA[i], plank.RNA[i], plank.NST[i], plank.Chl[i], p) * (1.0-lossFracN) * ac[i] * loss[i] - @inbounds CUDA.@atomic ctspop[x[i], y[i], z[i]] += total_P_biomass(plank.DNA[i], plank.RNA[i], plank.PST[i], p) * (1.0-lossFracP) * ac[i] * loss[i] - end - return nothing -end -function calc_loss!(ctsdoc, ctspoc, ctsdon, ctspon, ctsdop, ctspop, plank, ac, x, y, z, - loss, lossFracC, lossFracN, lossFracP, p, ::GPU) - @cuda threads=256 blocks=ceil(Int, size(ac,1)/256) gpu_calc_loss_kernel!(ctsdoc, ctspoc, ctsdon, ctspon, - ctsdop, ctspop, plank, ac, x, y, z, loss, - lossFracC, lossFracN, lossFracP, p) - return nothing +function calc_loss_kernel!(ctsdoc, ctspoc, ctsdon, ctspon, ctsdop, ctspop, plank, ac, x, y, z, + loss, lossFracC, lossFracN, lossFracP, p) + i = @index(Global) + @inbounds KernelAbstractions.@atomic ctsdoc[x[i], y[i], z[i]] += total_C_biomass(plank.PRO[i], plank.DNA[i], plank.RNA[i], plank.CH[i], plank.Chl[i]) * lossFracC * ac[i] * loss[i] + @inbounds KernelAbstractions.@atomic ctsdon[x[i], y[i], z[i]] += total_N_biomass(plank.PRO[i], plank.DNA[i], plank.RNA[i], plank.NST[i], plank.Chl[i], p) * lossFracN * ac[i] * loss[i] + @inbounds KernelAbstractions.@atomic ctsdop[x[i], y[i], z[i]] += total_P_biomass(plank.DNA[i], plank.RNA[i], plank.PST[i], p) * lossFracP * ac[i] * loss[i] + @inbounds KernelAbstractions.@atomic ctspoc[x[i], y[i], z[i]] += total_C_biomass(plank.PRO[i], plank.DNA[i], plank.RNA[i], plank.CH[i], plank.Chl[i]) * (1.0-lossFracC) * ac[i] * loss[i] + @inbounds KernelAbstractions.@atomic ctspon[x[i], y[i], z[i]] += total_N_biomass(plank.PRO[i], plank.DNA[i], plank.RNA[i], plank.NST[i], plank.Chl[i], p) * (1.0-lossFracN) * ac[i] * loss[i] + @inbounds KernelAbstractions.@atomic ctspop[x[i], y[i], z[i]] += total_P_biomass(plank.DNA[i], plank.RNA[i], plank.PST[i], p) * (1.0-lossFracP) * ac[i] * loss[i] end function calc_loss!(ctsdoc, ctspoc, ctsdon, ctspon, ctsdop, ctspop, plank, ac, x, y, z, - loss, lossFracC, lossFracN, lossFracP, p, ::CPU) - for i in 1:size(ac,1) - @inbounds ctsdoc[x[i], y[i], z[i]] += total_C_biomass(plank.PRO[i], plank.DNA[i], plank.RNA[i], plank.CH[i], plank.Chl[i]) * lossFracC * ac[i] * loss[i] - @inbounds ctsdon[x[i], y[i], z[i]] += total_N_biomass(plank.PRO[i], plank.DNA[i], plank.RNA[i], plank.NST[i], plank.Chl[i], p) * lossFracN * ac[i] * loss[i] - @inbounds ctsdop[x[i], y[i], z[i]] += total_P_biomass(plank.DNA[i], plank.RNA[i], plank.PST[i], p) * lossFracP * ac[i] * loss[i] - @inbounds ctspoc[x[i], y[i], z[i]] += total_C_biomass(plank.PRO[i], plank.DNA[i], plank.RNA[i], plank.CH[i], plank.Chl[i]) * (1.0-lossFracC) * ac[i] * loss[i] - @inbounds ctspon[x[i], y[i], z[i]] += total_N_biomass(plank.PRO[i], plank.DNA[i], plank.RNA[i], plank.NST[i], plank.Chl[i], p) * (1.0-lossFracN) * ac[i] * loss[i] - @inbounds ctspop[x[i], y[i], z[i]] += total_P_biomass(plank.DNA[i], plank.RNA[i], plank.PST[i], p) * (1.0-lossFracP) * ac[i] * loss[i] - end - return nothing + loss, lossFracC, lossFracN, lossFracP, p, arch) + kernel! = calc_loss_kernel!(device(arch), 256, (size(ac,1))) + kernel!(ctsdoc, ctspoc, ctsdon, ctspon, ctsdop, ctspop, plank, ac, x, y, z, + loss, lossFracC, lossFracN, lossFracP, p) +return nothing end \ No newline at end of file diff --git a/src/Plankton/QuotaMode/consume_loss.jl b/src/Plankton/QuotaMode/consume_loss.jl index 15017287..6d33bc33 100644 --- a/src/Plankton/QuotaMode/consume_loss.jl +++ b/src/Plankton/QuotaMode/consume_loss.jl @@ -1,63 +1,33 @@ ##### deal with nutrients uptake -function gpu_calc_consume_kernel!(ctsdic, ctsdoc, ctsnh4, ctsno3, ctspo4, plank, ac, x, y, z, ΔT) - index = (blockIdx().x - 1) * blockDim().x + threadIdx().x - stride = blockDim().x * gridDim().x - for i = index:stride:size(ac,1) - @inbounds CUDA.@atomic ctsdic[x[i], y[i], z[i]] += (plank.resp[i] - plank.PS[i]) * ΔT * ac[i] - @inbounds CUDA.@atomic ctsdoc[x[i], y[i], z[i]] += (plank.exu[i] - plank.VDOC[i]) * ΔT * ac[i] - @inbounds CUDA.@atomic ctsnh4[x[i], y[i], z[i]] += -plank.VNH4[i] * ΔT * ac[i] - @inbounds CUDA.@atomic ctsno3[x[i], y[i], z[i]] += -plank.VNO3[i] * ΔT * ac[i] - @inbounds CUDA.@atomic ctspo4[x[i], y[i], z[i]] += -plank.VPO4[i] * ΔT * ac[i] - end - return nothing +function calc_consume_kernel!(ctsdic, ctsdoc, ctsnh4, ctsno3, ctspo4, plank, ac, x, y, z, ΔT) + i = @index(Global) + @inbounds KernelAbstractions.@atomic ctsdic[x[i], y[i], z[i]] += (plank.resp[i] - plank.PS[i]) * ΔT * ac[i] + @inbounds KernelAbstractions.@atomic ctsdoc[x[i], y[i], z[i]] += (plank.exu[i] - plank.VDOC[i]) * ΔT * ac[i] + @inbounds KernelAbstractions.@atomic ctsnh4[x[i], y[i], z[i]] += -plank.VNH4[i] * ΔT * ac[i] + @inbounds KernelAbstractions.@atomic ctsno3[x[i], y[i], z[i]] += -plank.VNO3[i] * ΔT * ac[i] + @inbounds KernelAbstractions.@atomic ctspo4[x[i], y[i], z[i]] += -plank.VPO4[i] * ΔT * ac[i] end -function calc_consume!(ctsdic, ctsdoc, ctsnh4, ctsno3, ctspo4, plank, ac, x, y, z, ΔT, ::GPU) - @cuda threads=256 blocks=ceil(Int, size(ac,1)/256) gpu_calc_consume_kernel!(ctsdic, ctsdoc, ctsnh4, - ctsno3, ctspo4, plank, ac, x, y, z, ΔT) +function calc_consume!(ctsdic, ctsdoc, ctsnh4, ctsno3, ctspo4, plank, ac, x, y, z, ΔT, arch) + kernel! = calc_consume_kernel!(device(arch), 256, (size(ac,1))) + kernel!(ctsdic, ctsdoc, ctsnh4, ctsno3, ctspo4, plank, ac, x, y, z, ΔT) return nothing end -function calc_consume!(ctsdic, ctsdoc, ctsnh4, ctsno3, ctspo4, plank, ac, x, y, z, ΔT, ::CPU) - for i in 1:size(ac,1) - @inbounds ctsdic[x[i], y[i], z[i]] += (plank.resp[i] - plank.PS[i]) * ΔT * ac[i] - @inbounds ctsdoc[x[i], y[i], z[i]] += (plank.exu[i] - plank.VDOC[i]) * ΔT * ac[i] - @inbounds ctsnh4[x[i], y[i], z[i]] += -plank.VNH4[i] * ΔT * ac[i] - @inbounds ctsno3[x[i], y[i], z[i]] += -plank.VNO3[i] * ΔT * ac[i] - @inbounds ctspo4[x[i], y[i], z[i]] += -plank.VPO4[i] * ΔT * ac[i] - end - return nothing -end ##### deal with grazed or dead individuals -function gpu_calc_loss_kernel!(ctsdoc, ctspoc, ctsdon, ctspon, ctsdop, ctspop, plank, ac, x, y, z, - loss, lossFracC, lossFracN, lossFracP, R_NC, R_PC) - index = (blockIdx().x - 1) * blockDim().x + threadIdx().x - stride = blockDim().x * gridDim().x - for i = index:stride:size(ac,1) - @inbounds CUDA.@atomic ctsdoc[x[i], y[i], z[i]] += (plank.Bm[i] + plank.Cq[i]) * lossFracC * ac[i] * loss[i] - @inbounds CUDA.@atomic ctsdon[x[i], y[i], z[i]] += (plank.Bm[i]*R_NC + plank.Nq[i]) * lossFracN * ac[i] * loss[i] - @inbounds CUDA.@atomic ctsdop[x[i], y[i], z[i]] += (plank.Bm[i]*R_PC + plank.Pq[i]) * lossFracP * ac[i] * loss[i] - @inbounds CUDA.@atomic ctspoc[x[i], y[i], z[i]] += (plank.Bm[i] + plank.Cq[i]) * (1.0-lossFracC) * ac[i] * loss[i] - @inbounds CUDA.@atomic ctspon[x[i], y[i], z[i]] += (plank.Bm[i]*R_NC + plank.Nq[i]) * (1.0-lossFracN) * ac[i] * loss[i] - @inbounds CUDA.@atomic ctspop[x[i], y[i], z[i]] += (plank.Bm[i]*R_PC + plank.Pq[i]) * (1.0-lossFracP) * ac[i] * loss[i] - end - return nothing -end -function calc_loss!(ctsdoc, ctspoc, ctsdon, ctspon, ctsdop, ctspop, plank, ac, x, y, z, - loss, lossFracC, lossFracN, lossFracP, R_NC, R_PC, ::GPU) - @cuda threads=256 blocks=ceil(Int, size(ac,1)/256) gpu_calc_loss_kernel!(ctsdoc, ctspoc, ctsdon, ctspon, - ctsdop, ctspop, plank, ac, x, y, z, loss, - lossFracC, lossFracN, lossFracP, R_NC, R_PC) - return nothing +function calc_loss_kernel!(ctsdoc, ctspoc, ctsdon, ctspon, ctsdop, ctspop, plank, ac, x, y, z, + loss, lossFracC, lossFracN, lossFracP, R_NC, R_PC) + i = @index(Global) + @inbounds KernelAbstractions.@atomic ctsdoc[x[i], y[i], z[i]] += (plank.Bm[i] + plank.Cq[i]) * lossFracC * ac[i] * loss[i] + @inbounds KernelAbstractions.@atomic ctsdon[x[i], y[i], z[i]] += (plank.Bm[i]*R_NC + plank.Nq[i]) * lossFracN * ac[i] * loss[i] + @inbounds KernelAbstractions.@atomic ctsdop[x[i], y[i], z[i]] += (plank.Bm[i]*R_PC + plank.Pq[i]) * lossFracP * ac[i] * loss[i] + @inbounds KernelAbstractions.@atomic ctspoc[x[i], y[i], z[i]] += (plank.Bm[i] + plank.Cq[i]) * (1.0-lossFracC) * ac[i] * loss[i] + @inbounds KernelAbstractions.@atomic ctspon[x[i], y[i], z[i]] += (plank.Bm[i]*R_NC + plank.Nq[i]) * (1.0-lossFracN) * ac[i] * loss[i] + @inbounds KernelAbstractions.@atomic ctspop[x[i], y[i], z[i]] += (plank.Bm[i]*R_PC + plank.Pq[i]) * (1.0-lossFracP) * ac[i] * loss[i] end function calc_loss!(ctsdoc, ctspoc, ctsdon, ctspon, ctsdop, ctspop, plank, ac, x, y, z, - loss, lossFracC, lossFracN, lossFracP, R_NC, R_PC, ::CPU) - for i in 1:size(ac,1) - @inbounds ctsdoc[x[i], y[i], z[i]] += (plank.Bm[i] + plank.Cq[i]) * lossFracC * ac[i] * loss[i] - @inbounds ctsdon[x[i], y[i], z[i]] += (plank.Bm[i]*R_NC + plank.Nq[i]) * lossFracN * ac[i] * loss[i] - @inbounds ctsdop[x[i], y[i], z[i]] += (plank.Bm[i]*R_PC + plank.Pq[i]) * lossFracP * ac[i] * loss[i] - @inbounds ctspoc[x[i], y[i], z[i]] += (plank.Bm[i] + plank.Cq[i]) * (1.0-lossFracC) * ac[i] * loss[i] - @inbounds ctspon[x[i], y[i], z[i]] += (plank.Bm[i]*R_NC + plank.Nq[i]) * (1.0-lossFracN) * ac[i] * loss[i] - @inbounds ctspop[x[i], y[i], z[i]] += (plank.Bm[i]*R_PC + plank.Pq[i]) * (1.0-lossFracP) * ac[i] * loss[i] - end - return nothing + loss, lossFracC, lossFracN, lossFracP, R_NC, R_PC, arch) + kernel! = calc_loss_kernel!(device(arch), 256, (size(ac,1))) + kernel!(ctsdoc, ctspoc, ctsdon, ctspon, ctsdop, ctspop, plank, ac, x, y, z, + loss, lossFracC, lossFracN, lossFracP, R_NC, R_PC) +return nothing end \ No newline at end of file diff --git a/src/Plankton/utils.jl b/src/Plankton/utils.jl index 832128f3..d43de649 100644 --- a/src/Plankton/utils.jl +++ b/src/Plankton/utils.jl @@ -39,26 +39,16 @@ function find_NPT!(nuts, x, y, z, ac, NH4, NO3, PO4, DOC, par, temp, pop, arch:: end ##### calculate Chla and individual counts based on the status of plankton individuals -@inline function gpu_acc_counts_kernel!(ctsChl, ctspop, Chl, ac, x, y, z) - index = (blockIdx().x - 1) * blockDim().x + threadIdx().x - stride = blockDim().x * gridDim().x - for i = index:stride:size(ac,1) - @inbounds CUDA.@atomic ctsChl[x[i], y[i], z[i]] += Chl[i] * ac[i] - @inbounds CUDA.@atomic ctspop[x[i], y[i], z[i]] += ac[i] - end - return nothing +function acc_counts_kernel!(ctsChl, ctspop, Chl, ac, x, y, z) + i = @index(Global) + @inbounds KernelAbstractions.@atomic ctsChl[x[i], y[i], z[i]] += Chl[i] * ac[i] + @inbounds KernelAbstractions.@atomic ctspop[x[i], y[i], z[i]] += ac[i] end -function acc_counts!(ctsChl, ctspop, Chl, ac, x, y, z, ::GPU) - @cuda threads=256 blocks=ceil(Int, size(ac,1)/256) gpu_acc_counts_kernel!(ctsChl, ctspop, Chl, ac, x, y, z) +function acc_counts!(ctsChl, ctspop, Chl, ac, x, y, z, arch) + kernel! = acc_counts_kernel!(device(arch), 256, (size(ac,1))) + kernel!(ctsChl, ctspop, Chl, ac, x, y, z) return nothing end -function acc_counts!(ctsChl, ctspop, Chl, ac, x, y, z, ::CPU) - for i in 1:size(ac,1) - @inbounds ctsChl[x[i], y[i], z[i]] += Chl[i] * ac[i] - @inbounds ctspop[x[i], y[i], z[i]] += ac[i] - end - return nothing -end ##### calculate PAR field based on Chla field and depth @kernel function calc_par_kernel!(par, Chl, PARF, g::AbstractGrid, kc, kw)