diff --git a/examples/hybrid/tuning/mwe_tune_ke.jl b/examples/hybrid/tuning/mwe_tune_ke.jl
new file mode 100644
index 0000000000..f518f89829
--- /dev/null
+++ b/examples/hybrid/tuning/mwe_tune_ke.jl
@@ -0,0 +1,275 @@
+using CUDA
+using ClimaComms
+using ClimaCore
+using LinearAlgebra
+using NVTX, Colors
+
+import ClimaCore:
+    Domains,
+    Fields,
+    Geometry,
+    Meshes,
+    Operators,
+    Spaces,
+    Topologies,
+    DataLayouts,
+    RecursiveApply
+
+const C1 = ClimaCore.Geometry.Covariant1Vector
+const C2 = ClimaCore.Geometry.Covariant2Vector
+const C3 = ClimaCore.Geometry.Covariant3Vector
+const C12 = ClimaCore.Geometry.Covariant12Vector
+const C123 = ClimaCore.Geometry.Covariant123Vector
+const CT123 = Geometry.Contravariant123Vector
+const ᶜinterp = Operators.InterpolateF2C()
+const ᶠinterp = Operators.InterpolateC2F()
+
+const ⊞ = RecursiveApply.radd
+
+init_uθ(ϕ, z, R) = 1.0 / R
+init_vθ(ϕ, z, R) = 1.0 / R
+init_w(ϕ, z) = 1.0
+
+function center_initial_condition(ᶜlocal_geometry, R)
+    (; lat, long, z) = ᶜlocal_geometry.coordinates
+    u₀ = @. init_uθ(lat, z, R)
+    v₀ = @. init_vθ(lat, z, R)
+    ᶜuₕ_local = @. Geometry.UVVector(u₀, v₀)
+    ᶜuₕ = @. Geometry.Covariant12Vector(ᶜuₕ_local, ᶜlocal_geometry)
+    return ᶜuₕ
+end
+
+function face_initial_condition(local_geometry)
+    (; lat, long, z) = local_geometry.coordinates
+    w = @. Geometry.Covariant3Vector(init_w(lat, z))
+    return w
+end
+
+# initialize a scalar field (for KE)
+function init_scalar_field(space)
+    Y = map(Fields.local_geometry_field(space)) do local_geometry
+        h = 0.0
+        return h
+    end
+    return Y
+end
+
+function compute_kinetic_ca!(
+    κ::Fields.Field,
+    uₕ::Fields.Field,
+    uᵥ::Fields.Field,
+)
+    @assert eltype(uₕ) <: Union{C1, C2, C12}
+    @assert eltype(uᵥ) <: C3
+    #NVTX.@range "compute_kinetic! kernel" color = colorant"brown" begin
+    @. κ =
+        1 / 2 * (
+            dot(C123(uₕ), CT123(uₕ)) +
+            ᶜinterp(dot(C123(uᵥ), CT123(uᵥ))) +
+            2 * dot(CT123(uₕ), ᶜinterp(C123(uᵥ)))
+        )
+    #end
+end
+
+function compute_kinetic_ref!(
+    κ::Fields.Field,
+    uₕ::Fields.Field,
+    uᵥ::Fields.Field,
+)
+    hvc_lgd = Spaces.local_geometry_data(axes(uₕ)) # local geometry data for center extruded FD space
+    hvf_lgd = Spaces.local_geometry_data(axes(uᵥ)) # local geometry data for face extruded FD space
+
+    κ_val = Fields.field_values(κ)   # field values
+    uₕ_val = Fields.field_values(uₕ)
+    uᵥ_val = Fields.field_values(uᵥ)
+
+
+    (Nq, _, _, Nvc, Nh) = size(uₕ_val) # query dimensions
+    (Nq, _, _, Nvf, Nh) = size(uᵥ_val)
+
+    nitems = Nvc * Nq * Nq * Nh # # of items that can be independently processed
+    max_threads = 256
+    nthreads = min(max_threads, nitems)
+    nblocks = cld(nitems, nthreads)
+
+    @cuda threads = (nthreads,) blocks = (nblocks,) compute_kinetic_ref_kernel!(
+        κ_val,
+        uₕ_val,
+        uᵥ_val,
+        hvc_lgd,
+        hvf_lgd,
+        (Nq, Nvc, Nh),
+    )
+
+    return κ
+end
+
+function compute_kinetic_ref_kernel!(
+    κ_val,
+    uₕ_val,
+    uᵥ_val,
+    hvc_lgd,
+    hvf_lgd,
+    dims,
+)
+    gid = threadIdx().x + (blockIdx().x - 1) * blockDim().x
+    (Nq, Nvc, Nh) = dims
+    if gid ≤ Nvc * Nq * Nq * Nh
+        h = cld(gid, Nvc * Nq * Nq)
+        offset = (h - 1) * Nvc * Nq * Nq
+        j = cld(gid - offset, Nvc * Nq)
+        offset += (j - 1) * Nvc * Nq
+        i = cld(gid - offset, Nvc)
+        vc = gid - offset - (i - 1) * Nvc
+
+        idx = CartesianIndex((i, j, 1, vc, h))
+        idx⁻ = idx
+        idx⁺ = CartesianIndex((i, j, 1, vc + 1, h))
+
+        c_lgd = hvc_lgd[i, j, nothing, vc, h]
+
+        f_lgd⁺ = hvf_lgd[i, j, nothing, vc + 1, h]
+        f_lgd⁻ = hvf_lgd[i, j, nothing, vc, h]
+
+        uₕ_val_idx = uₕ_val[idx]
+        uᵥ_val_idx⁺ = uᵥ_val[idx⁺]
+        uᵥ_val_idx⁻ = uᵥ_val[idx⁻]
+
+        uₕ_c = C123(uₕ_val_idx, c_lgd)
+        uₕ_ct = CT123(uₕ_val_idx, c_lgd)
+
+        uᵥ_c⁺ = C123(uᵥ_val_idx⁺, f_lgd⁺)
+        uᵥ_c⁻ = C123(uᵥ_val_idx⁻, f_lgd⁻)
+        uᵥ_c = RecursiveApply.rdiv(uᵥ_c⁺ ⊞ uᵥ_c⁻, 2)
+
+        uᵥ_ct⁺ = CT123(uᵥ_val_idx⁺, f_lgd⁺)
+        uᵥ_ct⁻ = CT123(uᵥ_val_idx⁻, f_lgd⁻)
+
+        κ_val[idx] =
+            (
+                dot(uₕ_c, uₕ_ct) ⊞ RecursiveApply.rdiv(
+                    dot(uᵥ_c⁺, uᵥ_ct⁺) ⊞ dot(uᵥ_c⁻, uᵥ_ct⁻),
+                    2,
+                ) ⊞ 2 * dot(uₕ_ct, uᵥ_c)
+            ) / 2
+    end
+
+    return nothing
+end
+
+function initialize_mwe(device, ::Type{FT}) where {FT}
+    context = ClimaComms.SingletonCommsContext(device)
+    R = FT(6.371229e6)
+
+    npoly = 3
+    z_max = FT(30e3)
+    z_elem = 10
+    h_elem = 12
+    println(
+        "initializing on $(context.device); h_elem = $h_elem; z_elem = $z_elem; npoly = $npoly; R = $R; z_max = $z_max; FT = $FT",
+    )
+    # horizontal space
+    domain = Domains.SphereDomain(R)
+    horizontal_mesh = Meshes.EquiangularCubedSphere(domain, h_elem)
+    horizontal_topology = Topologies.Topology2D(
+        context,
+        horizontal_mesh,
+        Topologies.spacefillingcurve(horizontal_mesh),
+    )
+    quad = Spaces.Quadratures.GLL{npoly + 1}()
+    h_space = Spaces.SpectralElementSpace2D(horizontal_topology, quad)
+
+    # vertical space
+    z_domain = Domains.IntervalDomain(
+        Geometry.ZPoint(zero(z_max)),
+        Geometry.ZPoint(z_max);
+        boundary_tags = (:bottom, :top),
+    )
+    z_mesh = Meshes.IntervalMesh(z_domain, nelems = z_elem)
+    z_topology = Topologies.IntervalTopology(context, z_mesh)
+
+    z_center_space = Spaces.CenterFiniteDifferenceSpace(z_topology)
+
+    z_face_space = Spaces.FaceFiniteDifferenceSpace(z_topology)
+
+    hv_center_space =
+        Spaces.ExtrudedFiniteDifferenceSpace(h_space, z_center_space)
+    hv_face_space = Spaces.FaceExtrudedFiniteDifferenceSpace(hv_center_space)
+
+    ᶜlocal_geometry = Fields.local_geometry_field(hv_center_space)
+    ᶠlocal_geometry = Fields.local_geometry_field(hv_face_space)
+    uₕ = center_initial_condition(ᶜlocal_geometry, R)
+    uᵥ = face_initial_condition(ᶠlocal_geometry)
+    κ = init_scalar_field(hv_center_space)
+
+    return (; κ = κ, uₕ = uₕ, uᵥ = uᵥ)
+end
+
+function profile_compute_kinetic(::Type{FT}) where {FT}
+    κ, uₕ, uᵥ = initialize_mwe(ClimaComms.CUDADevice(), FT)
+    κ_cpu, uₕ_cpu, uᵥ_cpu = initialize_mwe(ClimaComms.CPUSingleThreaded(), FT)
+    # compute kinetic energy
+    κ = compute_kinetic_ca!(κ, uₕ, uᵥ)
+    κ_cpu = compute_kinetic_ca!(κ_cpu, uₕ_cpu, uᵥ_cpu)
+
+    @show Array(parent(κ)) ≈ parent(κ_cpu)
+
+
+    κ_ref, uₕ_ref, uᵥ_ref = initialize_mwe(ClimaComms.CUDADevice(), FT)
+
+    @show "before ref version" maximum(parent(κ_ref))
+
+    κ_ref = compute_kinetic_ref!(κ_ref, uₕ_ref, uᵥ_ref)
+    @show "after ref version" maximum(parent(κ_ref))
+
+    nreps = 10
+
+    for i in 1:nreps
+        NVTX.@range "compute_kinetic_ca!" color = colorant"blue" payload = i begin
+            CUDA.@sync κ = compute_kinetic_ca!(κ, uₕ, uᵥ)
+        end
+    end
+
+    for i in 1:nreps
+        NVTX.@range "compute_kinetic_ref!" color = colorant"green" payload = i begin
+            CUDA.@sync κ_ref = compute_kinetic_ref!(κ_ref, uₕ_ref, uᵥ_ref)
+        end
+    end
+
+
+    for i in 1:nreps
+        t_ca = CUDA.@elapsed κ = compute_kinetic_ca!(κ, uₕ, uᵥ)
+        t_ref =
+            CUDA.@elapsed κ_ref = compute_kinetic_ref!(κ_ref, uₕ_ref, uᵥ_ref)
+        println("t_ca = $t_ca (sec); t_ref = $t_ref (sec)")
+    end
+    return nothing
+end
+
+#profile_compute_kinetic(Float64)
+
+function profile_compute_divergence(::Type{FT}) where {FT}
+    κ, uₕ, uᵥ = initialize_mwe(ClimaComms.CUDADevice(), FT)
+    κ_cpu, uₕ_cpu, uᵥ_cpu = initialize_mwe(ClimaComms.CPUSingleThreaded(), FT)
+    hdiv = Operators.Divergence()
+    horz_div_cpu = hdiv.(uₕ_cpu)
+    horz_div = hdiv.(uₕ)
+
+    nreps = 100
+
+    for i in 1:nreps
+        t_div = CUDA.@elapsed horz_div .= hdiv.(uₕ)
+        println("t_div = $t_div (sec)")
+    end
+
+    for i in 1:nreps
+        NVTX.@range "compute_horizontal_divergence!" color = colorant"blue" payload =
+            i begin
+            CUDA.@sync horz_div .= hdiv.(uₕ)
+        end
+    end
+
+    return nothing
+end
+
+profile_compute_divergence(Float64)
diff --git a/src/Geometry/conversions.jl b/src/Geometry/conversions.jl
index a6667370ea..34e8d98de4 100644
--- a/src/Geometry/conversions.jl
+++ b/src/Geometry/conversions.jl
@@ -37,7 +37,6 @@ Contravariant123Vector(
 ) where {T} =
     local_geometry.gⁱʲ * Covariant123Vector(zero(u[1]), zero(u[1]), u[1])
 
-
 ContravariantVector(
     u::CovariantVector{T, I},
     local_geometry::LocalGeometry{I},
diff --git a/src/Operators/spectralelement.jl b/src/Operators/spectralelement.jl
index 373c229533..6480e56ade 100644
--- a/src/Operators/spectralelement.jl
+++ b/src/Operators/spectralelement.jl
@@ -254,34 +254,40 @@ function Base.copyto!(
     Nh = Topologies.nlocalelems(Spaces.topology(space))
     Nv = Spaces.nlevels(space)
     # executed
+    max_threads = 256
+    nitems = Nv * Nq * Nq * Nh # # of independent items
+    (nthreads, nblocks) = Spaces._configure_threadblock(max_threads, nitems)
     @inbounds begin
-        @cuda threads = (Nq, Nq) blocks = (Nh, Nv) copyto_spectral_kernel!(
+        @cuda threads = (nthreads,) blocks = (nblocks,) copyto_spectral_kernel!(
             strip_space(out, space),
             strip_space(sbc, space),
             space,
+            Nv,
+            Nq,
+            Nh,
         )
     end
     return out
 end
 
-function copyto_spectral_kernel!(out::Fields.Field, sbc, space)
+function copyto_spectral_kernel!(out::Fields.Field, sbc, space, Nv, Nq, Nh)
     @inbounds begin
-        i = threadIdx().x
-        j = threadIdx().y
-        h = blockIdx().x
-        if space isa Spaces.AbstractSpectralElementSpace
-            v = nothing
-        elseif space isa Spaces.FaceExtrudedFiniteDifferenceSpace
-            v = blockIdx().y - half
-        elseif space isa Spaces.CenterExtrudedFiniteDifferenceSpace
-            v = blockIdx().y
-        else
-            error("Invalid space")
+        gid = threadIdx().x + (blockIdx().x - 1) * blockDim().x
+        if gid ≤ Nv * Nq * Nq * Nh
+            (v, i, j, h) = Spaces._get_idx((Nv, Nq, Nq, Nh), gid)
+            if space isa Spaces.AbstractSpectralElementSpace
+                slabidx = Fields.SlabIndex(nothing, h)
+            elseif space isa Spaces.FaceExtrudedFiniteDifferenceSpace
+                slabidx = Fields.SlabIndex(v - half, h)
+            elseif space isa Spaces.CenterExtrudedFiniteDifferenceSpace
+                slabidx = Fields.SlabIndex(v, h)
+            else
+                error("Invalid space")
+            end
+            ij = CartesianIndex((i, j))
+            result = get_node(space, sbc, ij, slabidx)
+            set_node!(space, out, ij, slabidx, result)
         end
-        ij = CartesianIndex((i, j))
-        slabidx = Fields.SlabIndex(v, h)
-        result = get_node(space, sbc, ij, slabidx)
-        set_node!(space, out, ij, slabidx, result)
     end
     return nothing
 end