From 6d4ef1380c902cfb04d74b15006b70e3ac8520ed Mon Sep 17 00:00:00 2001
From: AntonOresten <antonoresten@proton.me>
Date: Tue, 13 Jan 2026 15:17:05 +0100
Subject: [PATCH 1/4] add `occupancy` and `num_ctas` compile arguments

---
 ext/CUDAExt.jl                 |  23 +++--
 src/bytecode/writer.jl         |  69 ++++++++++++++-
 src/compiler/codegen/kernel.jl |  13 ++-
 src/compiler/reflection.jl     |  10 ++-
 test/entry_hints.jl            | 150 +++++++++++++++++++++++++++++++++
 5 files changed, 248 insertions(+), 17 deletions(-)
 create mode 100644 test/entry_hints.jl

diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl
index c271549..b0cb87f 100644
--- a/ext/CUDAExt.jl
+++ b/ext/CUDAExt.jl
@@ -9,10 +9,10 @@ using CUDA_Compiler_jll
 public launch
 
 # Compilation cache - stores CuFunction directly to avoid re-loading CuModule
-const _compilation_cache = Dict{Any, Any}()  # (f, argtypes, sm_arch, opt_level) => CuFunction
+const _compilation_cache = Dict{Any, Any}()  # (f, argtypes, sm_arch, opt_level, num_ctas, occupancy) => CuFunction
 
 """
-    launch(f, grid, args...; name=nothing, sm_arch=default_sm_arch(), opt_level=3)
+    launch(f, grid, args...; name=nothing, sm_arch=default_sm_arch(), opt_level=3, num_ctas=nothing, occupancy=nothing)
 
 Compile and launch a kernel function with the given grid size and arguments.
 
@@ -26,6 +26,8 @@ are expanded to their constituent ptr, sizes, and strides parameters.
 - `name`: Optional kernel name for debugging
 - `sm_arch`: Target GPU architecture (default: current device's capability)
 - `opt_level`: Optimization level 0-3 (default: 3)
+- `num_ctas`: Number of CTAs in a CGA, 1-16, must be power of 2 (default: nothing)
+- `occupancy`: Expected active CTAs per SM, 1-32 (default: nothing)
 
 # Example
 ```julia
@@ -51,7 +53,9 @@ cuTile.launch(vadd_kernel, 64, a, b, c)
 function cuTile.launch(@nospecialize(f), grid, args...;
                        name::Union{String, Nothing}=nothing,
                        sm_arch::String=default_sm_arch(),
-                       opt_level::Int=3)
+                       opt_level::Int=3,
+                       num_ctas::Union{Int, Nothing}=nothing,
+                       occupancy::Union{Int, Nothing}=nothing)
     # Convert CuArray -> TileArray (and other conversions)
     tile_args = map(to_tile_arg, args)
 
@@ -62,10 +66,10 @@ function cuTile.launch(@nospecialize(f), grid, args...;
     kernel_name = name !== nothing ? name : string(nameof(f))
 
     # Check compilation cache - returns CuFunction directly
-    cache_key = (f, argtypes, sm_arch, opt_level)
+    cache_key = (f, argtypes, sm_arch, opt_level, num_ctas, occupancy)
     cufunc = get(_compilation_cache, cache_key, nothing)
     if cufunc === nothing || cuTile.compile_hook[] !== nothing
-        cubin = compile(f, argtypes; name, sm_arch, opt_level)
+        cubin = compile(f, argtypes; name, sm_arch, opt_level, num_ctas, occupancy)
         if cufunc === nothing
             cumod = CuModule(cubin)
             cufunc = CuFunction(cumod, kernel_name)
@@ -98,15 +102,18 @@ function cuTile.launch(@nospecialize(f), grid, args...;
 end
 
 """
-    compile(f, argtypes; name=nothing, sm_arch=default_sm_arch(), opt_level=3) -> Vector{UInt8}
+    compile(f, argtypes; name=nothing, sm_arch=default_sm_arch(), opt_level=3, num_ctas=nothing, occupancy=nothing) -> Vector{UInt8}
 
 Compile a Julia kernel function to a CUDA binary.
 """
 function compile(@nospecialize(f), @nospecialize(argtypes);
                  name::Union{String, Nothing}=nothing,
                  sm_arch::String=default_sm_arch(),
-                 opt_level::Int=3)
-    tile_bytecode = emit_tileir(f, argtypes; name)
+                 opt_level::Int=3,
+                 num_ctas::Union{Int, Nothing}=nothing,
+                 occupancy::Union{Int, Nothing}=nothing)
+    tile_bytecode = emit_tileir(f, argtypes; name, sm_arch,
+                                 num_ctas, occupancy)
 
     # Dump bytecode if JULIA_CUTILE_DUMP_BYTECODE is set
     dump_dir = get(ENV, "JULIA_CUTILE_DUMP_BYTECODE", nothing)
diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl
index c7a2ac3..0f6085a 100644
--- a/src/bytecode/writer.jl
+++ b/src/bytecode/writer.jl
@@ -507,9 +507,7 @@ function add_function!(writer::BytecodeWriter, func_buf::Vector{UInt8},
     flags = 0x00
     if is_entry
         flags |= 0x02
-        if entry_hints !== nothing
-            flags |= 0x04
-        end
+        isnothing(entry_hints) || (flags |= 0x04)
     end
     push!(func_buf, UInt8(flags))
 
@@ -542,3 +540,68 @@ function finalize_function!(func_buf::Vector{UInt8}, cb::CodeBuilder,
     encode_varint!(func_buf, length(cb.buf))
     append!(func_buf, cb.buf)
 end
+
+#=============================================================================
+ EntryHints: Kernel-level compilation hints
+=============================================================================#
+
+"""
+Kernel-level compilation hints (num_ctas, occupancy).
+Encoded as a dictionary attribute in bytecode.
+"""
+@kwdef struct EntryHints
+    num_ctas::Union{Int, Nothing} = nothing    # CTAs per CGA: 1, 2, 4, 8 or 16; `nothing` = hint not set
+    occupancy::Union{Int, Nothing} = nothing   # expected active CTAs per SM: 1-32; `nothing` = hint not set
+end
+
+function validate_num_ctas(num_ctas::Union{Int, Nothing})
+    num_ctas === nothing && return nothing  # hint not set: nothing to validate
+    (num_ctas < 1 || num_ctas > 16) && throw(ArgumentError("num_ctas must be between 1 and 16, got $num_ctas"))
+    return ispow2(num_ctas) || throw(ArgumentError("num_ctas must be a power of 2, got $num_ctas"))
+end
+
+function validate_occupancy(occupancy::Union{Int, Nothing})
+    occupancy === nothing && return nothing  # hint not set: nothing to validate
+    return occupancy in 1:32 || throw(ArgumentError("occupancy must be between 1 and 32, got $occupancy"))
+end
+
+"""
+Encode EntryHints as OptimizationHints format.
+Returns raw bytes for entry_hints parameter or nothing.
+"""
+function encode_entry_hints(writer::BytecodeWriter, sm_arch::String, hints::EntryHints)
+    validate_num_ctas(hints.num_ctas)
+    validate_occupancy(hints.occupancy)
+
+    # Collect (key, value) pairs for the hints that are set; num_ctas is pushed before occupancy
+    items = Tuple{String, Int}[]
+    isnothing(hints.num_ctas) || push!(items, ("num_cta_in_cga", hints.num_ctas))
+    isnothing(hints.occupancy) || push!(items, ("occupancy", hints.occupancy))
+    isempty(items) && return nothing
+
+    buf = UInt8[]
+
+    # Start with OptimizationHints tag
+    push!(buf, AttributeTag.OptimizationHints)
+
+    # Hints are stored per target architecture; only the single `sm_arch` entry is written here
+    # Layout: num_archs, then for each arch: arch string id followed by a dictionary attribute
+    encode_varint!(buf, 1)  # 1 architecture
+
+    # Architecture string ID
+    arch_id = writer.string_table[sm_arch]
+    encode_varint!(buf, arch_id.id)
+
+    # Encode dictionary
+    push!(buf, AttributeTag.Dictionary)
+    encode_varint!(buf, length(items))
+    for (key, value) in items
+        key_id = writer.string_table[key]  # NOTE(review): presumably interns the key on first use - confirm
+        encode_varint!(buf, key_id.id)
+        push!(buf, AttributeTag.Integer)
+        encode_typeid!(buf, I32(writer.type_table))
+        encode_varint!(buf, UInt32(value))  # validated above to lie in 1..32, so the conversion cannot throw
+    end
+
+    return buf
+end
diff --git a/src/compiler/codegen/kernel.jl b/src/compiler/codegen/kernel.jl
index 93057f5..72d50bb 100644
--- a/src/compiler/codegen/kernel.jl
+++ b/src/compiler/codegen/kernel.jl
@@ -1,14 +1,17 @@
 # kernel and argument handling
 
 """
-    emit_kernel!(writer, func_buf, target; name, is_entry=true)
+    emit_kernel!(writer, func_buf, target; name, sm_arch, is_entry=true, num_ctas=nothing, occupancy=nothing)
 
 Compile a TileTarget to Tile IR bytecode.
 """
 function emit_kernel!(writer::BytecodeWriter, func_buf::Vector{UInt8},
                       target::TileTarget;
                       name::String = string(target.mi.def.name),
-                      is_entry::Bool = true)
+                      sm_arch::String = "sm_100",
+                      is_entry::Bool = true,
+                      num_ctas::Union{Int, Nothing} = nothing,
+                      occupancy::Union{Int, Nothing} = nothing)
     ctx = CGCtx(writer, target)
     tt = ctx.tt
 
@@ -58,8 +61,12 @@ function emit_kernel!(writer::BytecodeWriter, func_buf::Vector{UInt8},
         push!(result_types, tile_type_for_julia!(ctx, target.rettype))
     end
 
+    # Create entry hints if provided
+    entry_hints = encode_entry_hints(writer, sm_arch, EntryHints(; num_ctas, occupancy))
+
     # Create function
-    cb = add_function!(writer, func_buf, name, param_types, result_types; is_entry)
+    cb = add_function!(writer, func_buf, name, param_types, result_types;
+                       is_entry, entry_hints)
     ctx.cb = cb
 
     # Set up argument values
diff --git a/src/compiler/reflection.jl b/src/compiler/reflection.jl
index 6cc1090..44f36e6 100644
--- a/src/compiler/reflection.jl
+++ b/src/compiler/reflection.jl
@@ -1,12 +1,15 @@
 export code_tiled, @code_tiled
 
 """
-    emit_tileir(f, argtypes; name=nothing) -> Vector{UInt8}
+    emit_tileir(f, argtypes; name=nothing, sm_arch="sm_100", num_ctas=nothing, occupancy=nothing) -> Vector{UInt8}
 
 Compile a Julia function to Tile IR bytecode.
 """
 function emit_tileir(@nospecialize(f), @nospecialize(argtypes);
-                     name::Union{String, Nothing} = nothing)
+                     name::Union{String, Nothing} = nothing,
+                     sm_arch::String = "sm_100",
+                     num_ctas::Union{Int, Nothing} = nothing,
+                     occupancy::Union{Int, Nothing} = nothing)
     target = TileTarget(f, argtypes)
     kernel_name = name === nothing ? string(target.mi.def.name) : name
 
@@ -15,7 +18,8 @@ function emit_tileir(@nospecialize(f), @nospecialize(argtypes);
     end
 
     buf = write_bytecode!(1) do writer, func_buf
-        emit_kernel!(writer, func_buf, target; name=kernel_name)
+        emit_kernel!(writer, func_buf, target; name=kernel_name, sm_arch,
+                     num_ctas, occupancy)
     end
 
     return buf
diff --git a/test/entry_hints.jl b/test/entry_hints.jl
new file mode 100644
index 0000000..07653d2
--- /dev/null
+++ b/test/entry_hints.jl
@@ -0,0 +1,150 @@
+@testset "Entry Hints" begin
+
+    @testset "MLIR Encoding" begin
+        # Setup: Define spec for concrete types
+        spec1d = cuTile.ArraySpec{1}(16, true)
+
+        function simple_kernel(a::cuTile.TileArray{Float32, 1, spec1d})
+            pid = cuTile.bid(1)
+            t = cuTile.load(a, pid, (16,))
+            cuTile.store(a, pid, t)
+            return nothing
+        end
+
+        argtypes = Tuple{cuTile.TileArray{Float32, 1, spec1d}}
+
+        @testset "num_ctas only" begin
+            bytecode = cuTile.emit_tileir(simple_kernel, argtypes; num_ctas=4)
+            mlir = cuTile.disassemble_tileir(bytecode)
+            @test occursin("optimization_hints=<sm_100 = {num_cta_in_cga = 4}>", mlir)
+        end
+
+        @testset "occupancy only" begin
+            bytecode = cuTile.emit_tileir(simple_kernel, argtypes; occupancy=8)
+            mlir = cuTile.disassemble_tileir(bytecode)
+            @test occursin("optimization_hints=<sm_100 = {occupancy = 8}>", mlir)
+        end
+
+        @testset "both hints" begin
+            bytecode = cuTile.emit_tileir(simple_kernel, argtypes;
+                                          num_ctas=2, occupancy=4)
+            mlir = cuTile.disassemble_tileir(bytecode)
+            # Both should appear (order may vary)
+            @test occursin("num_cta_in_cga = 2", mlir)
+            @test occursin("occupancy = 4", mlir)
+            @test occursin("optimization_hints=<sm_100 = {", mlir)
+        end
+
+        @testset "no hints" begin
+            bytecode = cuTile.emit_tileir(simple_kernel, argtypes)
+            mlir = cuTile.disassemble_tileir(bytecode)
+            # Should NOT have optimization_hints attribute on entry function
+            @test !occursin("optimization_hints", mlir)
+        end
+
+        @testset "architecture parameter" begin
+            bytecode = cuTile.emit_tileir(simple_kernel, argtypes;
+                                          sm_arch="sm_120", num_ctas=4)
+            mlir = cuTile.disassemble_tileir(bytecode)
+            @test occursin("optimization_hints=<sm_120 = {num_cta_in_cga = 4}>", mlir)
+        end
+    end
+
+    @testset "Validation" begin
+        spec1d = cuTile.ArraySpec{1}(16, true)
+
+        function dummy_kernel(a::cuTile.TileArray{Float32, 1, spec1d})
+            return nothing
+        end
+
+        argtypes = Tuple{cuTile.TileArray{Float32, 1, spec1d}}
+
+        @testset "num_ctas validation" begin
+            # Too small
+            @test_throws "num_ctas must be between 1 and 16" begin
+                cuTile.emit_tileir(dummy_kernel, argtypes; num_ctas=0)
+            end
+
+            # Too large
+            @test_throws "num_ctas must be between 1 and 16" begin
+                cuTile.emit_tileir(dummy_kernel, argtypes; num_ctas=17)
+            end
+
+            # Not power of 2
+            @test_throws "num_ctas must be a power of 2" begin
+                cuTile.emit_tileir(dummy_kernel, argtypes; num_ctas=3)
+            end
+
+            @test_throws "num_ctas must be a power of 2" begin
+                cuTile.emit_tileir(dummy_kernel, argtypes; num_ctas=5)
+            end
+
+            # Valid values should succeed
+            for valid_num_ctas in [1, 2, 4, 8, 16]
+                bytecode = cuTile.emit_tileir(dummy_kernel, argtypes;
+                                              num_ctas=valid_num_ctas)
+                @test !isempty(bytecode)
+            end
+        end
+
+        @testset "occupancy validation" begin
+            # Too small
+            @test_throws "occupancy must be between 1 and 32" begin
+                cuTile.emit_tileir(dummy_kernel, argtypes; occupancy=0)
+            end
+
+            # Too large
+            @test_throws "occupancy must be between 1 and 32" begin
+                cuTile.emit_tileir(dummy_kernel, argtypes; occupancy=33)
+            end
+
+            # Valid boundaries
+            bytecode1 = cuTile.emit_tileir(dummy_kernel, argtypes; occupancy=1)
+            @test !isempty(bytecode1)
+
+            bytecode32 = cuTile.emit_tileir(dummy_kernel, argtypes; occupancy=32)
+            @test !isempty(bytecode32)
+        end
+    end
+
+    # Integration tests only run if CUDA is available
+    if isdefined(Main, :CUDA) && CUDA.functional()
+        @testset "Integration" begin
+            function vadd_kernel(a::cuTile.TileArray{Float32,1},
+                                b::cuTile.TileArray{Float32,1},
+                                c::cuTile.TileArray{Float32,1})
+                pid = cuTile.bid(1)
+                ta = cuTile.load(a, pid, (16,))
+                tb = cuTile.load(b, pid, (16,))
+                cuTile.store(c, pid, ta + tb)
+                return nothing
+            end
+
+            n = 1024
+            a = CUDA.ones(Float32, n)
+            b = CUDA.ones(Float32, n) .* 2
+            c = CUDA.zeros(Float32, n)
+
+            @testset "launch with num_ctas" begin
+                cuTile.launch(vadd_kernel, 64, a, b, c; num_ctas=2)
+                CUDA.synchronize()
+                @test Array(c) ≈ ones(Float32, n) .* 3
+            end
+
+            @testset "launch with occupancy" begin
+                fill!(c, 0.0f0)
+                cuTile.launch(vadd_kernel, 64, a, b, c; occupancy=4)
+                CUDA.synchronize()
+                @test Array(c) ≈ ones(Float32, n) .* 3
+            end
+
+            @testset "launch with both hints" begin
+                fill!(c, 0.0f0)
+                cuTile.launch(vadd_kernel, 64, a, b, c; num_ctas=4, occupancy=8)
+                CUDA.synchronize()
+                @test Array(c) ≈ ones(Float32, n) .* 3
+            end
+        end
+    end
+
+end

From ac9f6f611944418730ff3f9c556e5414d35620b6 Mon Sep 17 00:00:00 2001
From: Anton Oresten <antonoresten@proton.me>
Date: Tue, 13 Jan 2026 15:10:23 +0100
Subject: [PATCH 2/4] writer.jl: expand entry_hints flag check into an `if` block

---
 src/bytecode/writer.jl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl
index 0f6085a..a04c7ff 100644
--- a/src/bytecode/writer.jl
+++ b/src/bytecode/writer.jl
@@ -507,7 +507,9 @@ function add_function!(writer::BytecodeWriter, func_buf::Vector{UInt8},
     flags = 0x00
     if is_entry
         flags |= 0x02
-        isnothing(entry_hints) || (flags |= 0x04)
+        if entry_hints !==
+            flags |= 0x04
+        end
     end
     push!(func_buf, UInt8(flags))
 

From ea12b3b8ce0541fbc77e64adf76245b13b556c4b Mon Sep 17 00:00:00 2001
From: Anton Oresten <antonoresten@proton.me>
Date: Tue, 13 Jan 2026 15:10:43 +0100
Subject: [PATCH 3/4] writer.jl: add missing `nothing` to entry_hints check

---
 src/bytecode/writer.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl
index a04c7ff..1d1d6fd 100644
--- a/src/bytecode/writer.jl
+++ b/src/bytecode/writer.jl
@@ -507,7 +507,7 @@ function add_function!(writer::BytecodeWriter, func_buf::Vector{UInt8},
     flags = 0x00
     if is_entry
         flags |= 0x02
-        if entry_hints !==
+        if entry_hints !== nothing
             flags |= 0x04
         end
     end

From 82f890285e059a79d58950a48fd8551f2c9c78cd Mon Sep 17 00:00:00 2001
From: AntonOresten <antonoresten@proton.me>
Date: Tue, 13 Jan 2026 20:57:28 +0100
Subject: [PATCH 4/4] make sm_arch optional, forward code_tiled kwargs, merge entry-hint tests into existing suites

---
 src/bytecode/writer.jl         |   7 +-
 src/compiler/codegen/kernel.jl |   4 +-
 src/compiler/reflection.jl     |  10 +--
 test/codegen.jl                | 115 +++++++++++++++++++++++++
 test/entry_hints.jl            | 150 ---------------------------------
 test/execution.jl              |  67 +++++++++++++++
 6 files changed, 194 insertions(+), 159 deletions(-)
 delete mode 100644 test/entry_hints.jl

diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl
index 1d1d6fd..bd2bc5d 100644
--- a/src/bytecode/writer.jl
+++ b/src/bytecode/writer.jl
@@ -571,7 +571,7 @@ end
 Encode EntryHints as OptimizationHints format.
 Returns raw bytes for entry_hints parameter or nothing.
 """
-function encode_entry_hints(writer::BytecodeWriter, sm_arch::String, hints::EntryHints)
+function encode_entry_hints(writer::BytecodeWriter, sm_arch::Union{String, Nothing}, hints::EntryHints)
     validate_num_ctas(hints.num_ctas)
     validate_occupancy(hints.occupancy)
 
@@ -581,6 +581,9 @@ function encode_entry_hints(writer::BytecodeWriter, sm_arch::String, hints::Entr
     isnothing(hints.occupancy) || push!(items, ("occupancy", hints.occupancy))
     isempty(items) && return nothing
 
+    # Hints are keyed by architecture, so sm_arch is required once any hint is present (no default is assumed)
+    arch = @something sm_arch throw(ArgumentError("sm_arch must be specified when entry hints are present"))
+
     buf = UInt8[]
 
     # Start with OptimizationHints tag
@@ -591,7 +594,7 @@ function encode_entry_hints(writer::BytecodeWriter, sm_arch::String, hints::Entr
     encode_varint!(buf, 1)  # 1 architecture
 
     # Architecture string ID
-    arch_id = writer.string_table[sm_arch]
+    arch_id = writer.string_table[arch]
     encode_varint!(buf, arch_id.id)
 
     # Encode dictionary
diff --git a/src/compiler/codegen/kernel.jl b/src/compiler/codegen/kernel.jl
index 72d50bb..71a4f5c 100644
--- a/src/compiler/codegen/kernel.jl
+++ b/src/compiler/codegen/kernel.jl
@@ -1,14 +1,14 @@
 # kernel and argument handling
 
 """
-    emit_kernel!(writer, func_buf, target; name, sm_arch, is_entry=true, num_ctas=nothing, occupancy=nothing)
+    emit_kernel!(writer, func_buf, target; name, sm_arch=nothing, is_entry=true, num_ctas=nothing, occupancy=nothing)
 
 Compile a TileTarget to Tile IR bytecode.
 """
 function emit_kernel!(writer::BytecodeWriter, func_buf::Vector{UInt8},
                       target::TileTarget;
                       name::String = string(target.mi.def.name),
-                      sm_arch::String = "sm_100",
+                      sm_arch::Union{String, Nothing} = nothing,
                       is_entry::Bool = true,
                       num_ctas::Union{Int, Nothing} = nothing,
                       occupancy::Union{Int, Nothing} = nothing)
diff --git a/src/compiler/reflection.jl b/src/compiler/reflection.jl
index 44f36e6..12c4cb3 100644
--- a/src/compiler/reflection.jl
+++ b/src/compiler/reflection.jl
@@ -1,13 +1,13 @@
 export code_tiled, @code_tiled
 
 """
-    emit_tileir(f, argtypes; name=nothing, sm_arch="sm_100", num_ctas=nothing, occupancy=nothing) -> Vector{UInt8}
+    emit_tileir(f, argtypes; name=nothing, sm_arch=nothing, num_ctas=nothing, occupancy=nothing) -> Vector{UInt8}
 
 Compile a Julia function to Tile IR bytecode.
 """
 function emit_tileir(@nospecialize(f), @nospecialize(argtypes);
                      name::Union{String, Nothing} = nothing,
-                     sm_arch::String = "sm_100",
+                     sm_arch::Union{String, Nothing} = nothing,
                      num_ctas::Union{Int, Nothing} = nothing,
                      occupancy::Union{Int, Nothing} = nothing)
     target = TileTarget(f, argtypes)
@@ -35,14 +35,14 @@ function disassemble_tileir(bytecode::Vector{UInt8})::String
 end
 
 """
-    code_tiled(f, argtypes; name=nothing) -> String
+    code_tiled(f, argtypes; name=nothing, sm_arch=nothing, num_ctas=nothing, occupancy=nothing) -> String
 
 Return the CUDA Tile IR for a Julia function as a textual MLIR representation.
 Analogous to `code_typed` or `code_structured`.
 """
 function code_tiled(@nospecialize(f), @nospecialize(argtypes);
-                   name::Union{String, Nothing} = nothing)
-    bytecode = emit_tileir(f, argtypes; name)
+                   kwargs...)
+    bytecode = emit_tileir(f, argtypes; kwargs...)
     disassemble_tileir(bytecode)
 end
 
diff --git a/test/codegen.jl b/test/codegen.jl
index fa5a90e..7e04e27 100644
--- a/test/codegen.jl
+++ b/test/codegen.jl
@@ -1846,3 +1846,118 @@ end
         end
     end
 end
+
+#=============================================================================
+ Entry Hints (optimization_hints attribute)
+=============================================================================#
+
+@testset "Entry Hints" begin
+    # Shared 1-D ArraySpec used in every kernel signature below
+    spec1d = ct.ArraySpec{1}(16, true)
+
+    @testset "num_ctas only" begin
+        @test @filecheck begin
+            @check "optimization_hints=<sm_100 = {num_cta_in_cga = 4}>"
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas=4) do a
+                pid = ct.bid(1)
+                t = ct.load(a, pid, (16,))
+                ct.store(a, pid, t)
+                return nothing
+            end
+        end
+    end
+
+    @testset "occupancy only" begin
+        @test @filecheck begin
+            @check "optimization_hints=<sm_100 = {occupancy = 8}>"
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", occupancy=8) do a
+                pid = ct.bid(1)
+                t = ct.load(a, pid, (16,))
+                ct.store(a, pid, t)
+                return nothing
+            end
+        end
+    end
+
+    @testset "both hints" begin
+        @test @filecheck begin
+            @check "optimization_hints=<sm_120 = {num_cta_in_cga = 2, occupancy = 4}>"
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120", num_ctas=2, occupancy=4) do a
+                pid = ct.bid(1)
+                t = ct.load(a, pid, (16,))
+                ct.store(a, pid, t)
+                return nothing
+            end
+        end
+    end
+
+    @testset "no hints" begin  # sm_arch on its own must not emit the attribute
+        @test @filecheck begin
+            @check_not "optimization_hints"
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100") do a
+                pid = ct.bid(1)
+                t = ct.load(a, pid, (16,))
+                ct.store(a, pid, t)
+                return nothing
+            end
+        end
+    end
+
+    @testset "architecture parameter" begin
+        @test @filecheck begin
+            @check "optimization_hints=<sm_120 = {num_cta_in_cga = 4}>"
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120", num_ctas=4) do a
+                pid = ct.bid(1)
+                t = ct.load(a, pid, (16,))
+                ct.store(a, pid, t)
+                return nothing
+            end
+        end
+    end
+
+    @testset "num_ctas validation" begin
+        # Too small
+        @test_throws "num_ctas must be between 1 and 16" begin
+            ct.code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas=0)
+        end
+
+        # Too large
+        @test_throws "num_ctas must be between 1 and 16" begin
+            ct.code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas=17)
+        end
+
+        # Not power of 2
+        @test_throws "num_ctas must be a power of 2" begin
+            ct.code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas=3)
+        end
+
+        @test_throws "num_ctas must be a power of 2" begin
+            ct.code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas=5)
+        end
+
+        # Every power of two in 1:16 must be accepted; code_tiled returns the MLIR text
+        for num_ctas in [1, 2, 4, 8, 16]
+            mlir = ct.code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas)
+            @test !isempty(mlir)
+        end
+    end
+
+    @testset "occupancy validation" begin
+        # Too small
+        @test_throws "occupancy must be between 1 and 32" begin
+            ct.code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", occupancy=0)
+        end
+
+        # Too large
+        @test_throws "occupancy must be between 1 and 32" begin
+            ct.code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", occupancy=33)
+        end
+
+        # Both ends of the accepted range must produce MLIR text
+        mlir1 = ct.code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", occupancy=1)
+        @test !isempty(mlir1)
+
+        mlir32 = ct.code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", occupancy=32)
+        @test !isempty(mlir32)
+    end
+end
diff --git a/test/entry_hints.jl b/test/entry_hints.jl
deleted file mode 100644
index 07653d2..0000000
--- a/test/entry_hints.jl
+++ /dev/null
@@ -1,150 +0,0 @@
-@testset "Entry Hints" begin
-
-    @testset "MLIR Encoding" begin
-        # Setup: Define spec for concrete types
-        spec1d = cuTile.ArraySpec{1}(16, true)
-
-        function simple_kernel(a::cuTile.TileArray{Float32, 1, spec1d})
-            pid = cuTile.bid(1)
-            t = cuTile.load(a, pid, (16,))
-            cuTile.store(a, pid, t)
-            return nothing
-        end
-
-        argtypes = Tuple{cuTile.TileArray{Float32, 1, spec1d}}
-
-        @testset "num_ctas only" begin
-            bytecode = cuTile.emit_tileir(simple_kernel, argtypes; num_ctas=4)
-            mlir = cuTile.disassemble_tileir(bytecode)
-            @test occursin("optimization_hints=<sm_100 = {num_cta_in_cga = 4}>", mlir)
-        end
-
-        @testset "occupancy only" begin
-            bytecode = cuTile.emit_tileir(simple_kernel, argtypes; occupancy=8)
-            mlir = cuTile.disassemble_tileir(bytecode)
-            @test occursin("optimization_hints=<sm_100 = {occupancy = 8}>", mlir)
-        end
-
-        @testset "both hints" begin
-            bytecode = cuTile.emit_tileir(simple_kernel, argtypes;
-                                          num_ctas=2, occupancy=4)
-            mlir = cuTile.disassemble_tileir(bytecode)
-            # Both should appear (order may vary)
-            @test occursin("num_cta_in_cga = 2", mlir)
-            @test occursin("occupancy = 4", mlir)
-            @test occursin("optimization_hints=<sm_100 = {", mlir)
-        end
-
-        @testset "no hints" begin
-            bytecode = cuTile.emit_tileir(simple_kernel, argtypes)
-            mlir = cuTile.disassemble_tileir(bytecode)
-            # Should NOT have optimization_hints attribute on entry function
-            @test !occursin("optimization_hints", mlir)
-        end
-
-        @testset "architecture parameter" begin
-            bytecode = cuTile.emit_tileir(simple_kernel, argtypes;
-                                          sm_arch="sm_120", num_ctas=4)
-            mlir = cuTile.disassemble_tileir(bytecode)
-            @test occursin("optimization_hints=<sm_120 = {num_cta_in_cga = 4}>", mlir)
-        end
-    end
-
-    @testset "Validation" begin
-        spec1d = cuTile.ArraySpec{1}(16, true)
-
-        function dummy_kernel(a::cuTile.TileArray{Float32, 1, spec1d})
-            return nothing
-        end
-
-        argtypes = Tuple{cuTile.TileArray{Float32, 1, spec1d}}
-
-        @testset "num_ctas validation" begin
-            # Too small
-            @test_throws "num_ctas must be between 1 and 16" begin
-                cuTile.emit_tileir(dummy_kernel, argtypes; num_ctas=0)
-            end
-
-            # Too large
-            @test_throws "num_ctas must be between 1 and 16" begin
-                cuTile.emit_tileir(dummy_kernel, argtypes; num_ctas=17)
-            end
-
-            # Not power of 2
-            @test_throws "num_ctas must be a power of 2" begin
-                cuTile.emit_tileir(dummy_kernel, argtypes; num_ctas=3)
-            end
-
-            @test_throws "num_ctas must be a power of 2" begin
-                cuTile.emit_tileir(dummy_kernel, argtypes; num_ctas=5)
-            end
-
-            # Valid values should succeed
-            for valid_num_ctas in [1, 2, 4, 8, 16]
-                bytecode = cuTile.emit_tileir(dummy_kernel, argtypes;
-                                              num_ctas=valid_num_ctas)
-                @test !isempty(bytecode)
-            end
-        end
-
-        @testset "occupancy validation" begin
-            # Too small
-            @test_throws "occupancy must be between 1 and 32" begin
-                cuTile.emit_tileir(dummy_kernel, argtypes; occupancy=0)
-            end
-
-            # Too large
-            @test_throws "occupancy must be between 1 and 32" begin
-                cuTile.emit_tileir(dummy_kernel, argtypes; occupancy=33)
-            end
-
-            # Valid boundaries
-            bytecode1 = cuTile.emit_tileir(dummy_kernel, argtypes; occupancy=1)
-            @test !isempty(bytecode1)
-
-            bytecode32 = cuTile.emit_tileir(dummy_kernel, argtypes; occupancy=32)
-            @test !isempty(bytecode32)
-        end
-    end
-
-    # Integration tests only run if CUDA is available
-    if isdefined(Main, :CUDA) && CUDA.functional()
-        @testset "Integration" begin
-            function vadd_kernel(a::cuTile.TileArray{Float32,1},
-                                b::cuTile.TileArray{Float32,1},
-                                c::cuTile.TileArray{Float32,1})
-                pid = cuTile.bid(1)
-                ta = cuTile.load(a, pid, (16,))
-                tb = cuTile.load(b, pid, (16,))
-                cuTile.store(c, pid, ta + tb)
-                return nothing
-            end
-
-            n = 1024
-            a = CUDA.ones(Float32, n)
-            b = CUDA.ones(Float32, n) .* 2
-            c = CUDA.zeros(Float32, n)
-
-            @testset "launch with num_ctas" begin
-                cuTile.launch(vadd_kernel, 64, a, b, c; num_ctas=2)
-                CUDA.synchronize()
-                @test Array(c) ≈ ones(Float32, n) .* 3
-            end
-
-            @testset "launch with occupancy" begin
-                fill!(c, 0.0f0)
-                cuTile.launch(vadd_kernel, 64, a, b, c; occupancy=4)
-                CUDA.synchronize()
-                @test Array(c) ≈ ones(Float32, n) .* 3
-            end
-
-            @testset "launch with both hints" begin
-                fill!(c, 0.0f0)
-                cuTile.launch(vadd_kernel, 64, a, b, c; num_ctas=4, occupancy=8)
-                CUDA.synchronize()
-                @test Array(c) ≈ ones(Float32, n) .* 3
-            end
-        end
-    end
-
-end
diff --git a/test/execution.jl b/test/execution.jl
index 0edb38f..44e5114 100644
--- a/test/execution.jl
+++ b/test/execution.jl
@@ -1589,3 +1589,70 @@ end
 end
 
 end
+
+@testset "Entry Hints Integration" begin  # each case runs the same vector add with different launch hints
+
+@testset "launch with num_ctas" begin
+    function vadd_kernel_num_ctas(a::ct.TileArray{Float32,1},
+                        b::ct.TileArray{Float32,1},
+                        c::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,))
+        tb = ct.load(b, pid, (16,))
+        ct.store(c, pid, ta + tb)
+        return nothing
+    end
+
+    n = 1024  # NOTE(review): presumably 64 blocks x 16-element tiles - confirm against ct.load indexing
+    a = CUDA.ones(Float32, n)
+    b = CUDA.ones(Float32, n) .* 2
+    c = CUDA.zeros(Float32, n)
+
+    ct.launch(vadd_kernel_num_ctas, 64, a, b, c; num_ctas=2)
+    CUDA.synchronize()
+    @test Array(c) ≈ ones(Float32, n) .* 3  # a .+ b == 1 + 2 for every element
+end
+
+@testset "launch with occupancy" begin
+    function vadd_kernel_occupancy(a::ct.TileArray{Float32,1},
+                        b::ct.TileArray{Float32,1},
+                        c::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,))
+        tb = ct.load(b, pid, (16,))
+        ct.store(c, pid, ta + tb)
+        return nothing
+    end
+
+    n = 1024  # NOTE(review): presumably 64 blocks x 16-element tiles - confirm against ct.load indexing
+    a = CUDA.ones(Float32, n)
+    b = CUDA.ones(Float32, n) .* 2
+    c = CUDA.zeros(Float32, n)
+
+    ct.launch(vadd_kernel_occupancy, 64, a, b, c; occupancy=4)
+    CUDA.synchronize()
+    @test Array(c) ≈ ones(Float32, n) .* 3  # a .+ b == 1 + 2 for every element
+end
+
+@testset "launch with both hints" begin
+    function vadd_kernel_both_hints(a::ct.TileArray{Float32,1},
+                        b::ct.TileArray{Float32,1},
+                        c::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,))
+        tb = ct.load(b, pid, (16,))
+        ct.store(c, pid, ta + tb)
+        return nothing
+    end
+
+    n = 1024  # NOTE(review): presumably 64 blocks x 16-element tiles - confirm against ct.load indexing
+    a = CUDA.ones(Float32, n)
+    b = CUDA.ones(Float32, n) .* 2
+    c = CUDA.zeros(Float32, n)
+
+    ct.launch(vadd_kernel_both_hints, 64, a, b, c; num_ctas=4, occupancy=8)
+    CUDA.synchronize()
+    @test Array(c) ≈ ones(Float32, n) .* 3  # a .+ b == 1 + 2 for every element
+end
+
+end