JuliaGPU · maleadt · Jan 19, 2026 · Jan 14, 2026 · Jan 14, 2026 · Jan 14, 2026
diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl
@@ -9,7 +9,7 @@ using CUDA_Compiler_jll
 public launch
 
 # Compilation cache - stores CuFunction directly to avoid re-loading CuModule
-const _compilation_cache = Dict{Any, Any}()  # (f, argtypes, sm_arch, opt_level, num_ctas, occupancy) => CuFunction
+const _compilation_cache = Dict{Any, Any}()  # (method, argtypes, sm_arch, opt_level, num_ctas, occupancy) => CuFunction
 
 """
     launch(f, grid, args...; name=nothing, sm_arch=default_sm_arch(), opt_level=3, num_ctas=nothing, occupancy=nothing)
@@ -65,8 +65,11 @@ function cuTile.launch(@nospecialize(f), grid, args...;
     # Determine kernel name
     kernel_name = name !== nothing ? name : string(nameof(f))
 
+    # Key the cache on the Method returned by `which`, so a redefined kernel misses the cache
+    method = which(f, argtypes)
+
     # Check compilation cache - returns CuFunction directly
-    cache_key = (f, argtypes, sm_arch, opt_level, num_ctas, occupancy)
+    cache_key = (method, argtypes, sm_arch, opt_level, num_ctas, occupancy)
     cufunc = get(_compilation_cache, cache_key, nothing)
     if cufunc === nothing || cuTile.compile_hook[] !== nothing
         cubin = compile(f, argtypes; name, sm_arch, opt_level, num_ctas, occupancy)

diff --git a/test/execution.jl b/test/execution.jl
@@ -1999,3 +1999,39 @@ end
 end
 
 end
+
+@testset "redefine kernel method" begin  # launch -> redefine the kernel -> launch again
+    mod = @eval module $(gensym())  # uniquely named module that holds the kernel definition
+        import cuTile as ct
+        function vadd_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, c::ct.TileArray{Float32,1})
+            pid = ct.bid(1)
+            ta = ct.load(a, (pid,), (16,))
+            tb = ct.load(b, (pid,), (16,))
+            ct.store(c, (pid,), ta + tb)  # first definition: c = a + b
+            return
+        end
+    end
+
+    a = CUDA.ones(Float32, 1024)
+    b = CUDA.ones(Float32, 1024)
+    c = CUDA.zeros(Float32, 1024)
+
+    ct.launch(mod.vadd_kernel, 64, a, b, c)  # grid of 64 with 16-element tiles: 64 * 16 = 1024
+
+    @test Array(c) ≈ Array(a) + Array(b)
+
+    @eval mod begin  # redefine vadd_kernel in the same module with an identical signature
+        function vadd_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, c::ct.TileArray{Float32,1})
+            pid = ct.bid(1)
+            ta = ct.load(a, (pid,), (16,))
+            tb = ct.load(b, (pid,), (16,))
+            ct.store(c, (pid,), ta + tb * 2)  # only change from the first definition: tb is doubled
+            return
+        end
+    end
+
+    ct.launch(mod.vadd_kernel, 64, a, b, c)  # same function and argument types as the first launch
+
+    @test Array(c) ≈ Array(a) + Array(b) * 2  # passes only if the redefined body actually ran
+end
+