Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions ext/CUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ using CUDA_Compiler_jll
public launch

# Compilation cache - stores CuFunction directly to avoid re-loading CuModule
const _compilation_cache = Dict{Any, Any}() # (f, argtypes, sm_arch, opt_level, num_ctas, occupancy) => CuFunction
const _compilation_cache = Dict{Any, Any}() # (method, argtypes, sm_arch, opt_level, num_ctas, occupancy) => CuFunction

"""
launch(f, grid, args...; name=nothing, sm_arch=default_sm_arch(), opt_level=3, num_ctas=nothing, occupancy=nothing)
Expand Down Expand Up @@ -65,8 +65,11 @@ function cuTile.launch(@nospecialize(f), grid, args...;
# Determine kernel name
kernel_name = name !== nothing ? name : string(nameof(f))

# Key the cache on the resolved `Method` (not on `f`), so redefining the kernel yields a new entry
method = which(f, argtypes)

# Check compilation cache - returns CuFunction directly
cache_key = (f, argtypes, sm_arch, opt_level, num_ctas, occupancy)
cache_key = (method, argtypes, sm_arch, opt_level, num_ctas, occupancy)
cufunc = get(_compilation_cache, cache_key, nothing)
if cufunc === nothing || cuTile.compile_hook[] !== nothing
cubin = compile(f, argtypes; name, sm_arch, opt_level, num_ctas, occupancy)
Expand Down
36 changes: 36 additions & 0 deletions test/execution.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1999,3 +1999,39 @@ end
end

end

@testset "redefine kernel method" begin
    # Regression test: launching a kernel, redefining its method with the same
    # signature, and launching again must run the *new* body rather than a
    # previously compiled one.

    # Define the kernel inside a throwaway, uniquely named module so the
    # redefinition below does not touch any other definition.
    modname = gensym()
    sandbox = @eval module $modname
        import cuTile as ct
        function vadd_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1},
                             c::ct.TileArray{Float32,1})
            pid = ct.bid(1)
            ta = ct.load(a, (pid,), (16,))
            tb = ct.load(b, (pid,), (16,))
            ct.store(c, (pid,), ta + tb)
            return
        end
    end

    len = 1024
    grid = 64  # 64 blocks, each handling one 16-element tile: 64 * 16 == 1024

    lhs = CUDA.ones(Float32, len)
    rhs = CUDA.ones(Float32, len)
    out = CUDA.zeros(Float32, len)

    # First version of the kernel: out = lhs + rhs
    ct.launch(sandbox.vadd_kernel, grid, lhs, rhs, out)

    @test Array(out) ≈ Array(lhs) + Array(rhs)

    # Replace the method (identical signature) with one that scales `b` by 2.
    Core.eval(sandbox, quote
        function vadd_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1},
                             c::ct.TileArray{Float32,1})
            pid = ct.bid(1)
            ta = ct.load(a, (pid,), (16,))
            tb = ct.load(b, (pid,), (16,))
            ct.store(c, (pid,), ta + tb * 2)
            return
        end
    end)

    # Second version of the kernel: out = lhs + 2 * rhs
    ct.launch(sandbox.vadd_kernel, grid, lhs, rhs, out)

    @test Array(out) ≈ Array(lhs) + Array(rhs) * 2
end