From ff2dd7e73d18d212a820ba4300dad5655c29343a Mon Sep 17 00:00:00 2001 From: sriharshakandala Date: Thu, 20 Jul 2023 21:36:29 -0700 Subject: [PATCH] Add reference GPU kernel --- examples/hybrid/tuning/mwe_tune_ke.jl | 114 +++++++++++++++++++++++++- 1 file changed, 113 insertions(+), 1 deletion(-) diff --git a/examples/hybrid/tuning/mwe_tune_ke.jl b/examples/hybrid/tuning/mwe_tune_ke.jl index cd48cf9fb9..b97a414118 100644 --- a/examples/hybrid/tuning/mwe_tune_ke.jl +++ b/examples/hybrid/tuning/mwe_tune_ke.jl @@ -12,7 +12,8 @@ import ClimaCore: Operators, Spaces, Topologies, - DataLayouts + DataLayouts, + RecursiveApply const C1 = ClimaCore.Geometry.Covariant1Vector const C2 = ClimaCore.Geometry.Covariant2Vector @@ -23,6 +24,8 @@ const CT123 = Geometry.Contravariant123Vector const ᶜinterp = Operators.InterpolateF2C() const ᶠinterp = Operators.InterpolateC2F() +const ⊞ = RecursiveApply.radd + init_uθ(ϕ, z, R) = 1.0 / R init_vθ(ϕ, z, R) = 1.0 / R init_w(ϕ, z) = 1.0 @@ -68,6 +71,92 @@ function compute_kinetic_ca!( #end end +function compute_kinetic_ref!( + κ::Fields.Field, + uₕ::Fields.Field, + uᵥ::Fields.Field, +) + hvc_lgd = Spaces.local_geometry_data(axes(uₕ)) # local geometry data for center extruded FD space + hvf_lgd = Spaces.local_geometry_data(axes(uᵥ)) # local geometry data for face extruded FD space + + κ_val = Fields.field_values(κ) # field values + uₕ_val = Fields.field_values(uₕ) + uᵥ_val = Fields.field_values(uᵥ) + + + (Nq, _, _, Nvc, Nh) = size(uₕ_val) # query dimensions + (Nq, _, _, Nvf, Nh) = size(uᵥ_val) + + nitems = Nvc * Nq * Nq * Nh # # of items that can be independently processed + max_threads = 256 + nthreads = min(max_threads, nitems) + nblocks = cld(nitems, nthreads) + + @cuda threads = (nthreads,) blocks = (nblocks,) compute_kinetic_ref_kernel!( + κ_val, + uₕ_val, + uᵥ_val, + hvc_lgd, + hvf_lgd, + (Nq, Nvc, Nh), + ) + + return κ +end + +function compute_kinetic_ref_kernel!( + κ_val, + uₕ_val, + uᵥ_val, + hvc_lgd, + hvf_lgd, + dims, +) + gid = threadIdx().x + (blockIdx().x - 1) * blockDim().x + (Nq, Nvc, Nh) = dims + if gid ≤ Nvc * Nq * Nq * Nh + h = cld(gid, Nvc * Nq * Nq) + offset = (h - 1) * Nvc * Nq * Nq + j = cld(gid - offset, Nvc * Nq) + offset += (j - 1) * Nvc * Nq + i = cld(gid - offset, Nvc) + vc = gid - offset - (i - 1) * Nvc + + idx = CartesianIndex((i, j, 1, vc, h)) + idx⁻ = idx + idx⁺ = CartesianIndex((i, j, 1, vc + 1, h)) + + c_lgd = hvc_lgd[i, j, nothing, vc, h] + + f_lgd⁺ = hvf_lgd[i, j, nothing, vc + 1, h] + f_lgd⁻ = hvf_lgd[i, j, nothing, vc, h] + + uₕ_val_idx = uₕ_val[idx] + uᵥ_val_idx⁺ = uᵥ_val[idx⁺] + uᵥ_val_idx⁻ = uᵥ_val[idx⁻] + + uₕ_c = C123(uₕ_val_idx, c_lgd) + uₕ_ct = CT123(uₕ_val_idx, c_lgd) + + uᵥ_c⁺ = C123(uᵥ_val_idx⁺, f_lgd⁺) + uᵥ_c⁻ = C123(uᵥ_val_idx⁻, f_lgd⁻) + uᵥ_c = RecursiveApply.rdiv(uᵥ_c⁺ ⊞ uᵥ_c⁻, 2) + + uᵥ_ct⁺ = CT123(uᵥ_val_idx⁺, f_lgd⁺) + uᵥ_ct⁻ = CT123(uᵥ_val_idx⁻, f_lgd⁻) + + κ_val[idx] = + ( + dot(uₕ_c, uₕ_ct) ⊞ RecursiveApply.rdiv( + dot(uᵥ_c⁺, uᵥ_ct⁺) ⊞ dot(uᵥ_c⁻, uᵥ_ct⁻), + 2, + ) ⊞ 2 * dot(uₕ_ct, uᵥ_c) + ) / 2 + end + + return nothing +end + function initialize_mwe(device, ::Type{FT}) where {FT} context = ClimaComms.SingletonCommsContext(device) R = FT(6.371229e6) @@ -125,6 +214,14 @@ function profile_compute_kinetic(::Type{FT}) where {FT} @show Array(parent(κ)) ≈ parent(κ_cpu) + + κ_ref, uₕ_ref, uᵥ_ref = initialize_mwe(ClimaComms.CUDADevice(), FT) + + @show "before ref version" maximum(parent(κ_ref)) + + κ_ref = compute_kinetic_ref!(κ_ref, uₕ_ref, uᵥ_ref) + @show "after ref version" maximum(parent(κ_ref)) + nreps = 10 for i in 1:nreps @@ -132,6 +229,21 @@ function profile_compute_kinetic(::Type{FT}) where {FT} CUDA.@sync κ = compute_kinetic_ca!(κ, uₕ, uᵥ) end end + + for i in 1:nreps + NVTX.@range "compute_kinetic_ref!" color = colorant"green" payload = i begin + CUDA.@sync κ_ref = compute_kinetic_ref!(κ_ref, uₕ_ref, uᵥ_ref) + end + end + + + for i in 1:nreps + t_ca = CUDA.@elapsed κ = compute_kinetic_ca!(κ, uₕ, uᵥ) + t_ref = + CUDA.@elapsed κ_ref = compute_kinetic_ref!(κ_ref, uₕ_ref, uᵥ_ref) + println("t_ca = $t_ca (sec); t_ref = $t_ref (sec)") + end + end profile_compute_kinetic(Float64)