From 6d4ef1380c902cfb04d74b15006b70e3ac8520ed Mon Sep 17 00:00:00 2001 From: AntonOresten Date: Tue, 13 Jan 2026 15:17:05 +0100 Subject: [PATCH 1/4] add `occupancy` and `num_ctas` compile arguments --- ext/CUDAExt.jl | 23 +++-- src/bytecode/writer.jl | 69 ++++++++++++++- src/compiler/codegen/kernel.jl | 13 ++- src/compiler/reflection.jl | 10 ++- test/entry_hints.jl | 150 +++++++++++++++++++++++++++++++++ 5 files changed, 248 insertions(+), 17 deletions(-) create mode 100644 test/entry_hints.jl diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl index c271549..b0cb87f 100644 --- a/ext/CUDAExt.jl +++ b/ext/CUDAExt.jl @@ -9,10 +9,10 @@ using CUDA_Compiler_jll public launch # Compilation cache - stores CuFunction directly to avoid re-loading CuModule -const _compilation_cache = Dict{Any, Any}() # (f, argtypes, sm_arch, opt_level) => CuFunction +const _compilation_cache = Dict{Any, Any}() # (f, argtypes, sm_arch, opt_level, num_ctas, occupancy) => CuFunction """ - launch(f, grid, args...; name=nothing, sm_arch=default_sm_arch(), opt_level=3) + launch(f, grid, args...; name=nothing, sm_arch=default_sm_arch(), opt_level=3, num_ctas=nothing, occupancy=nothing) Compile and launch a kernel function with the given grid size and arguments. @@ -26,6 +26,8 @@ are expanded to their constituent ptr, sizes, and strides parameters. - `name`: Optional kernel name for debugging - `sm_arch`: Target GPU architecture (default: current device's capability) - `opt_level`: Optimization level 0-3 (default: 3) +- `num_ctas`: Number of CTAs in a CGA, 1-16, must be power of 2 (default: nothing) +- `occupancy`: Expected active CTAs per SM, 1-32 (default: nothing) # Example ```julia @@ -51,7 +53,9 @@ cuTile.launch(vadd_kernel, 64, a, b, c) function cuTile.launch(@nospecialize(f), grid, args...; name::Union{String, Nothing}=nothing, sm_arch::String=default_sm_arch(), - opt_level::Int=3) + opt_level::Int=3, + num_ctas::Union{Int, Nothing}=nothing, + occupancy::Union{Int, Nothing}=nothing) # Convert CuArray -> TileArray (and other conversions) tile_args = map(to_tile_arg, args) @@ -62,10 +66,10 @@ function cuTile.launch(@nospecialize(f), grid, args...; kernel_name = name !== nothing ? name : string(nameof(f)) # Check compilation cache - returns CuFunction directly - cache_key = (f, argtypes, sm_arch, opt_level) + cache_key = (f, argtypes, sm_arch, opt_level, num_ctas, occupancy) cufunc = get(_compilation_cache, cache_key, nothing) if cufunc === nothing || cuTile.compile_hook[] !== nothing - cubin = compile(f, argtypes; name, sm_arch, opt_level) + cubin = compile(f, argtypes; name, sm_arch, opt_level, num_ctas, occupancy) if cufunc === nothing cumod = CuModule(cubin) cufunc = CuFunction(cumod, kernel_name) @@ -98,15 +102,18 @@ function cuTile.launch(@nospecialize(f), grid, args...; end """ - compile(f, argtypes; name=nothing, sm_arch=default_sm_arch(), opt_level=3) -> Vector{UInt8} + compile(f, argtypes; name=nothing, sm_arch=default_sm_arch(), opt_level=3, num_ctas=nothing, occupancy=nothing) -> Vector{UInt8} Compile a Julia kernel function to a CUDA binary. """ function compile(@nospecialize(f), @nospecialize(argtypes); name::Union{String, Nothing}=nothing, sm_arch::String=default_sm_arch(), - opt_level::Int=3) - tile_bytecode = emit_tileir(f, argtypes; name) + opt_level::Int=3, + num_ctas::Union{Int, Nothing}=nothing, + occupancy::Union{Int, Nothing}=nothing) + tile_bytecode = emit_tileir(f, argtypes; name, sm_arch, + num_ctas, occupancy) # Dump bytecode if JULIA_CUTILE_DUMP_BYTECODE is set dump_dir = get(ENV, "JULIA_CUTILE_DUMP_BYTECODE", nothing) diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl index c7a2ac3..0f6085a 100644 --- a/src/bytecode/writer.jl +++ b/src/bytecode/writer.jl @@ -507,9 +507,7 @@ function add_function!(writer::BytecodeWriter, func_buf::Vector{UInt8}, flags = 0x00 if is_entry flags |= 0x02 - if entry_hints !== nothing - flags |= 0x04 - end + isnothing(entry_hints) || (flags |= 0x04) end push!(func_buf, UInt8(flags)) @@ -542,3 +540,68 @@ function finalize_function!(func_buf::Vector{UInt8}, cb::CodeBuilder, encode_varint!(func_buf, length(cb.buf)) append!(func_buf, cb.buf) end + +#============================================================================= + EntryHints: Kernel-level compilation hints +=============================================================================# + +""" +Kernel-level compilation hints (num_ctas, occupancy). +Encoded as a dictionary attribute in bytecode. +""" +@kwdef struct EntryHints + num_ctas::Union{Int, Nothing} = nothing # 1, 2, 4, 8, 16 + occupancy::Union{Int, Nothing} = nothing # 1-32 +end + +function validate_num_ctas(num_ctas::Union{Int, Nothing}) + isnothing(num_ctas) && return + 1 <= num_ctas <= 16 || throw(ArgumentError("num_ctas must be between 1 and 16, got $num_ctas")) + ispow2(num_ctas) || throw(ArgumentError("num_ctas must be a power of 2, got $num_ctas")) +end + +function validate_occupancy(occupancy::Union{Int, Nothing}) + isnothing(occupancy) && return + 1 <= occupancy <= 32 || throw(ArgumentError("occupancy must be between 1 and 32, got $occupancy")) +end + +""" +Encode EntryHints as OptimizationHints format. +Returns raw bytes for entry_hints parameter or nothing. +""" +function encode_entry_hints(writer::BytecodeWriter, sm_arch::String, hints::EntryHints) + validate_num_ctas(hints.num_ctas) + validate_occupancy(hints.occupancy) + + # Build items list (only non-nothing values) + items = Tuple{String, Int}[] + isnothing(hints.num_ctas) || push!(items, ("num_cta_in_cga", hints.num_ctas)) + isnothing(hints.occupancy) || push!(items, ("occupancy", hints.occupancy)) + isempty(items) && return nothing + + buf = UInt8[] + + # Start with OptimizationHints tag + push!(buf, AttributeTag.OptimizationHints) + + # Encode as architecture-specific dictionary + # Format: num_archs, then for each arch: arch_id, dictionary + encode_varint!(buf, 1) # 1 architecture + + # Architecture string ID + arch_id = writer.string_table[sm_arch] + encode_varint!(buf, arch_id.id) + + # Encode dictionary + push!(buf, AttributeTag.Dictionary) + encode_varint!(buf, length(items)) + for (key, value) in items + key_id = writer.string_table[key] + encode_varint!(buf, key_id.id) + push!(buf, AttributeTag.Integer) + encode_typeid!(buf, I32(writer.type_table)) + encode_varint!(buf, UInt32(value)) + end + + return buf +end diff --git a/src/compiler/codegen/kernel.jl b/src/compiler/codegen/kernel.jl index 93057f5..72d50bb 100644 --- a/src/compiler/codegen/kernel.jl +++ b/src/compiler/codegen/kernel.jl @@ -1,14 +1,17 @@ # kernel and argument handling """ - emit_kernel!(writer, func_buf, target; name, is_entry=true) + emit_kernel!(writer, func_buf, target; name, sm_arch, is_entry=true, num_ctas=nothing, occupancy=nothing) Compile a TileTarget to Tile IR bytecode. """ function emit_kernel!(writer::BytecodeWriter, func_buf::Vector{UInt8}, target::TileTarget; name::String = string(target.mi.def.name), - is_entry::Bool = true) + sm_arch::String = "sm_100", + is_entry::Bool = true, + num_ctas::Union{Int, Nothing} = nothing, + occupancy::Union{Int, Nothing} = nothing) ctx = CGCtx(writer, target) tt = ctx.tt @@ -58,8 +61,12 @@ function emit_kernel!(writer::BytecodeWriter, func_buf::Vector{UInt8}, push!(result_types, tile_type_for_julia!(ctx, target.rettype)) end + # Create entry hints if provided + entry_hints = encode_entry_hints(writer, sm_arch, EntryHints(; num_ctas, occupancy)) + # Create function - cb = add_function!(writer, func_buf, name, param_types, result_types; is_entry) + cb = add_function!(writer, func_buf, name, param_types, result_types; + is_entry, entry_hints) ctx.cb = cb # Set up argument values diff --git a/src/compiler/reflection.jl b/src/compiler/reflection.jl index 6cc1090..44f36e6 100644 --- a/src/compiler/reflection.jl +++ b/src/compiler/reflection.jl @@ -1,12 +1,15 @@ export code_tiled, @code_tiled """ - emit_tileir(f, argtypes; name=nothing) -> Vector{UInt8} + emit_tileir(f, argtypes; name=nothing, sm_arch="sm_100", num_ctas=nothing, occupancy=nothing) -> Vector{UInt8} Compile a Julia function to Tile IR bytecode. """ function emit_tileir(@nospecialize(f), @nospecialize(argtypes); - name::Union{String, Nothing} = nothing) + name::Union{String, Nothing} = nothing, + sm_arch::String = "sm_100", + num_ctas::Union{Int, Nothing} = nothing, + occupancy::Union{Int, Nothing} = nothing) target = TileTarget(f, argtypes) kernel_name = name === nothing ? string(target.mi.def.name) : name @@ -15,7 +18,8 @@ function emit_tileir(@nospecialize(f), @nospecialize(argtypes); end buf = write_bytecode!(1) do writer, func_buf - emit_kernel!(writer, func_buf, target; name=kernel_name) + emit_kernel!(writer, func_buf, target; name=kernel_name, sm_arch, + num_ctas, occupancy) end return buf diff --git a/test/entry_hints.jl b/test/entry_hints.jl new file mode 100644 index 0000000..07653d2 --- /dev/null +++ b/test/entry_hints.jl @@ -0,0 +1,150 @@ +@testset "Entry Hints" begin + + @testset "MLIR Encoding" begin + # Setup: Define spec for concrete types + spec1d = cuTile.ArraySpec{1}(16, true) + + function simple_kernel(a::cuTile.TileArray{Float32, 1, spec1d}) + pid = cuTile.bid(1) + t = cuTile.load(a, pid, (16,)) + cuTile.store(a, pid, t) + return nothing + end + + argtypes = Tuple{cuTile.TileArray{Float32, 1, spec1d}} + + @testset "num_ctas only" begin + bytecode = cuTile.emit_tileir(simple_kernel, argtypes; num_ctas=4) + mlir = cuTile.disassemble_tileir(bytecode) + @test occursin("optimization_hints=", mlir) + end + + @testset "occupancy only" begin + bytecode = cuTile.emit_tileir(simple_kernel, argtypes; occupancy=8) + mlir = cuTile.disassemble_tileir(bytecode) + @test occursin("optimization_hints=", mlir) + end + + @testset "both hints" begin + bytecode = cuTile.emit_tileir(simple_kernel, argtypes; + num_ctas=2, occupancy=4) + mlir = cuTile.disassemble_tileir(bytecode) + # Both should appear (order may vary) + @test occursin("num_cta_in_cga = 2", mlir) + @test occursin("occupancy = 4", mlir) + @test occursin("optimization_hints=", mlir) + end + end + + @testset "Validation" begin + spec1d = cuTile.ArraySpec{1}(16, true) + + function dummy_kernel(a::cuTile.TileArray{Float32, 1, spec1d}) + return nothing + end + + argtypes = Tuple{cuTile.TileArray{Float32, 1, spec1d}} + + @testset "num_ctas validation" begin + # Too small + @test_throws "num_ctas must be between 1 and 16" begin + cuTile.emit_tileir(dummy_kernel, argtypes; num_ctas=0) + end + + # Too large + @test_throws "num_ctas must be between 1 and 16" begin + cuTile.emit_tileir(dummy_kernel, argtypes; num_ctas=17) + end + + # Not power of 2 + @test_throws "num_ctas must be a power of 2" begin + cuTile.emit_tileir(dummy_kernel, argtypes; num_ctas=3) + end + + @test_throws "num_ctas must be a power of 2" begin + cuTile.emit_tileir(dummy_kernel, argtypes; num_ctas=5) + end + + # Valid values should succeed + for valid_num_ctas in [1, 2, 4, 8, 16] + bytecode = cuTile.emit_tileir(dummy_kernel, argtypes; + num_ctas=valid_num_ctas) + @test !isempty(bytecode) + end + end + + @testset "occupancy validation" begin + # Too small + @test_throws "occupancy must be between 1 and 32" begin + cuTile.emit_tileir(dummy_kernel, argtypes; occupancy=0) + end + + # Too large + @test_throws "occupancy must be between 1 and 32" begin + cuTile.emit_tileir(dummy_kernel, argtypes; occupancy=33) + end + + # Valid boundaries + bytecode1 = cuTile.emit_tileir(dummy_kernel, argtypes; occupancy=1) + @test !isempty(bytecode1) + + bytecode32 = cuTile.emit_tileir(dummy_kernel, argtypes; occupancy=32) + @test !isempty(bytecode32) + end + end + + # Integration tests only run if CUDA is available + if isdefined(Main, :CUDA) && CUDA.functional() + @testset "Integration" begin + function vadd_kernel(a::cuTile.TileArray{Float32,1}, + b::cuTile.TileArray{Float32,1}, + c::cuTile.TileArray{Float32,1}) + pid = cuTile.bid(1) + ta = cuTile.load(a, pid, (16,)) + tb = cuTile.load(b, pid, (16,)) + cuTile.store(c, pid, ta + tb) + return nothing + end + + n = 1024 + a = CUDA.ones(Float32, n) + b = CUDA.ones(Float32, n) .* 2 + c = CUDA.zeros(Float32, n) + + @testset "launch with num_ctas" begin + cuTile.launch(vadd_kernel, 64, a, b, c; num_ctas=2) + CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 + end + + @testset "launch with occupancy" begin + fill!(c, 0.0f0) + cuTile.launch(vadd_kernel, 64, a, b, c; occupancy=4) + CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 + end + + @testset "launch with both hints" begin + fill!(c, 0.0f0) + cuTile.launch(vadd_kernel, 64, a, b, c; num_ctas=4, occupancy=8) + CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 + end + end + end + +end From ac9f6f611944418730ff3f9c556e5414d35620b6 Mon Sep 17 00:00:00 2001 From: Anton Oresten Date: Tue, 13 Jan 2026 15:10:23 +0100 Subject: [PATCH 2/4] Update writer.jl --- src/bytecode/writer.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl index 0f6085a..a04c7ff 100644 --- a/src/bytecode/writer.jl +++ b/src/bytecode/writer.jl @@ -507,7 +507,9 @@ function add_function!(writer::BytecodeWriter, func_buf::Vector{UInt8}, flags = 0x00 if is_entry flags |= 0x02 - isnothing(entry_hints) || (flags |= 0x04) + if entry_hints !== + flags |= 0x04 + end end push!(func_buf, UInt8(flags)) From ea12b3b8ce0541fbc77e64adf76245b13b556c4b Mon Sep 17 00:00:00 2001 From: Anton Oresten Date: Tue, 13 Jan 2026 15:10:43 +0100 Subject: [PATCH 3/4] Update writer.jl --- src/bytecode/writer.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl index a04c7ff..1d1d6fd 100644 --- a/src/bytecode/writer.jl +++ b/src/bytecode/writer.jl @@ -507,7 +507,7 @@ function add_function!(writer::BytecodeWriter, func_buf::Vector{UInt8}, flags = 0x00 if is_entry flags |= 0x02 - if entry_hints !== + if entry_hints !== nothing flags |= 0x04 end end From 82f890285e059a79d58950a48fd8551f2c9c78cd Mon Sep 17 00:00:00 2001 From: AntonOresten Date: Tue, 13 Jan 2026 20:57:28 +0100 Subject: [PATCH 4/4] fixes --- src/bytecode/writer.jl | 7 +- src/compiler/codegen/kernel.jl | 4 +- src/compiler/reflection.jl | 10 +-- test/codegen.jl | 115 +++++++++++++++++++++++++ test/entry_hints.jl | 150 --------------------------------- test/execution.jl | 67 +++++++++++++++ 6 files changed, 194 insertions(+), 159 deletions(-) delete mode 100644 test/entry_hints.jl diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl index 1d1d6fd..bd2bc5d 100644 --- a/src/bytecode/writer.jl +++ b/src/bytecode/writer.jl @@ -571,7 +571,7 @@ end Encode EntryHints as OptimizationHints format. Returns raw bytes for entry_hints parameter or nothing. """ -function encode_entry_hints(writer::BytecodeWriter, sm_arch::String, hints::EntryHints) +function encode_entry_hints(writer::BytecodeWriter, sm_arch::Union{String, Nothing}, hints::EntryHints) validate_num_ctas(hints.num_ctas) validate_occupancy(hints.occupancy) @@ -581,6 +581,9 @@ function encode_entry_hints(writer::BytecodeWriter, sm_arch::String, hints::Entr isnothing(hints.occupancy) || push!(items, ("occupancy", hints.occupancy)) isempty(items) && return nothing + # Use default architecture if not specified and hints are present + arch = @something sm_arch throw(ArgumentError("sm_arch must be specified when entry hints are present")) + buf = UInt8[] # Start with OptimizationHints tag @@ -591,7 +594,7 @@ function encode_entry_hints(writer::BytecodeWriter, sm_arch::String, hints::Entr encode_varint!(buf, 1) # 1 architecture # Architecture string ID - arch_id = writer.string_table[sm_arch] + arch_id = writer.string_table[arch] encode_varint!(buf, arch_id.id) # Encode dictionary diff --git a/src/compiler/codegen/kernel.jl b/src/compiler/codegen/kernel.jl index 72d50bb..71a4f5c 100644 --- a/src/compiler/codegen/kernel.jl +++ b/src/compiler/codegen/kernel.jl @@ -1,14 +1,14 @@ # kernel and argument handling """ - emit_kernel!(writer, func_buf, target; name, sm_arch, is_entry=true, num_ctas=nothing, occupancy=nothing) + emit_kernel!(writer, func_buf, target; name, sm_arch=nothing, is_entry=true, num_ctas=nothing, occupancy=nothing) Compile a TileTarget to Tile IR bytecode. """ function emit_kernel!(writer::BytecodeWriter, func_buf::Vector{UInt8}, target::TileTarget; name::String = string(target.mi.def.name), - sm_arch::String = "sm_100", + sm_arch::Union{String, Nothing} = nothing, is_entry::Bool = true, num_ctas::Union{Int, Nothing} = nothing, occupancy::Union{Int, Nothing} = nothing) diff --git a/src/compiler/reflection.jl b/src/compiler/reflection.jl index 44f36e6..12c4cb3 100644 --- a/src/compiler/reflection.jl +++ b/src/compiler/reflection.jl @@ -1,13 +1,13 @@ export code_tiled, @code_tiled """ - emit_tileir(f, argtypes; name=nothing, sm_arch="sm_100", num_ctas=nothing, occupancy=nothing) -> Vector{UInt8} + emit_tileir(f, argtypes; name, sm_arch, num_ctas, occupancy) -> Vector{UInt8} Compile a Julia function to Tile IR bytecode. """ function emit_tileir(@nospecialize(f), @nospecialize(argtypes); name::Union{String, Nothing} = nothing, - sm_arch::String = "sm_100", + sm_arch::Union{String, Nothing} = nothing, num_ctas::Union{Int, Nothing} = nothing, occupancy::Union{Int, Nothing} = nothing) target = TileTarget(f, argtypes) @@ -35,14 +35,14 @@ function disassemble_tileir(bytecode::Vector{UInt8})::String end """ - code_tiled(f, argtypes; name=nothing) -> String + code_tiled(f, argtypes; name, sm_arch, num_ctas, occupancy) -> String Return the CUDA Tile IR for a Julia function as a textual MLIR representation. Analogous to `code_typed` or `code_structured`. """ function code_tiled(@nospecialize(f), @nospecialize(argtypes); - name::Union{String, Nothing} = nothing) - bytecode = emit_tileir(f, argtypes; name) + kwargs...) + bytecode = emit_tileir(f, argtypes; kwargs...) disassemble_tileir(bytecode) end diff --git a/test/codegen.jl b/test/codegen.jl index fa5a90e..7e04e27 100644 --- a/test/codegen.jl +++ b/test/codegen.jl @@ -1846,3 +1846,118 @@ end end end end + +#============================================================================= + Entry Hints (optimization_hints attribute) +=============================================================================# + +@testset "Entry Hints" begin + # Common ArraySpecs for tests + spec1d = ct.ArraySpec{1}(16, true) + + @testset "num_ctas only" begin + @test @filecheck begin + @check "optimization_hints=" + ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas=4) do a + pid = ct.bid(1) + t = ct.load(a, pid, (16,)) + ct.store(a, pid, t) + return nothing + end + end + end + + @testset "occupancy only" begin + @test @filecheck begin + @check "optimization_hints=" + ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", occupancy=8) do a + pid = ct.bid(1) + t = ct.load(a, pid, (16,)) + ct.store(a, pid, t) + return nothing + end + end + end + + @testset "both hints" begin + @test @filecheck begin + @check "optimization_hints=" + ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120", num_ctas=4) do a + pid = ct.bid(1) + t = ct.load(a, pid, (16,)) + ct.store(a, pid, t) + return nothing + end + end + end + + @testset "num_ctas validation" begin + # Too small + @test_throws "num_ctas must be between 1 and 16" begin + code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas=0) + end + + # Too large + @test_throws "num_ctas must be between 1 and 16" begin + code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas=17) + end + + # Not power of 2 + @test_throws "num_ctas must be a power of 2" begin + code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas=3) + end + + @test_throws "num_ctas must be a power of 2" begin + code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas=5) + end + + # Valid values should succeed + for num_ctas in [1, 2, 4, 8, 16] + bytecode = code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas) + @test !isempty(bytecode) + end + end + + @testset "occupancy validation" begin + # Too small + @test_throws "occupancy must be between 1 and 32" begin + code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", occupancy=0) + end + + # Too large + @test_throws "occupancy must be between 1 and 32" begin + code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", occupancy=33) + end + + # Valid boundaries + bytecode1 = code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", occupancy=1) + @test !isempty(bytecode1) + + bytecode32 = code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", occupancy=32) + @test !isempty(bytecode32) + end +end diff --git a/test/entry_hints.jl b/test/entry_hints.jl deleted file mode 100644 index 07653d2..0000000 --- a/test/entry_hints.jl +++ /dev/null @@ -1,150 +0,0 @@ -@testset "Entry Hints" begin - - @testset "MLIR Encoding" begin - # Setup: Define spec for concrete types - spec1d = cuTile.ArraySpec{1}(16, true) - - function simple_kernel(a::cuTile.TileArray{Float32, 1, spec1d}) - pid = cuTile.bid(1) - t = cuTile.load(a, pid, (16,)) - cuTile.store(a, pid, t) - return nothing - end - - argtypes = Tuple{cuTile.TileArray{Float32, 1, spec1d}} - - @testset "num_ctas only" begin - bytecode = cuTile.emit_tileir(simple_kernel, argtypes; num_ctas=4) - mlir = cuTile.disassemble_tileir(bytecode) - @test occursin("optimization_hints=", mlir) - end - - @testset "occupancy only" begin - bytecode = cuTile.emit_tileir(simple_kernel, argtypes; occupancy=8) - mlir = cuTile.disassemble_tileir(bytecode) - @test occursin("optimization_hints=", mlir) - end - - @testset "both hints" begin - bytecode = cuTile.emit_tileir(simple_kernel, argtypes; - num_ctas=2, occupancy=4) - mlir = cuTile.disassemble_tileir(bytecode) - # Both should appear (order may vary) - @test occursin("num_cta_in_cga = 2", mlir) - @test occursin("occupancy = 4", mlir) - @test occursin("optimization_hints=", mlir) - end - end - - @testset "Validation" begin - spec1d = cuTile.ArraySpec{1}(16, true) - - function dummy_kernel(a::cuTile.TileArray{Float32, 1, spec1d}) - return nothing - end - - argtypes = Tuple{cuTile.TileArray{Float32, 1, spec1d}} - - @testset "num_ctas validation" begin - # Too small - @test_throws "num_ctas must be between 1 and 16" begin - cuTile.emit_tileir(dummy_kernel, argtypes; num_ctas=0) - end - - # Too large - @test_throws "num_ctas must be between 1 and 16" begin - cuTile.emit_tileir(dummy_kernel, argtypes; num_ctas=17) - end - - # Not power of 2 - @test_throws "num_ctas must be a power of 2" begin - cuTile.emit_tileir(dummy_kernel, argtypes; num_ctas=3) - end - - @test_throws "num_ctas must be a power of 2" begin - cuTile.emit_tileir(dummy_kernel, argtypes; num_ctas=5) - end - - # Valid values should succeed - for valid_num_ctas in [1, 2, 4, 8, 16] - bytecode = cuTile.emit_tileir(dummy_kernel, argtypes; - num_ctas=valid_num_ctas) - @test !isempty(bytecode) - end - end - - @testset "occupancy validation" begin - # Too small - @test_throws "occupancy must be between 1 and 32" begin - cuTile.emit_tileir(dummy_kernel, argtypes; occupancy=0) - end - - # Too large - @test_throws "occupancy must be between 1 and 32" begin - cuTile.emit_tileir(dummy_kernel, argtypes; occupancy=33) - end - - # Valid boundaries - bytecode1 = cuTile.emit_tileir(dummy_kernel, argtypes; occupancy=1) - @test !isempty(bytecode1) - - bytecode32 = cuTile.emit_tileir(dummy_kernel, argtypes; occupancy=32) - @test !isempty(bytecode32) - end - end - - # Integration tests only run if CUDA is available - if isdefined(Main, :CUDA) && CUDA.functional() - @testset "Integration" begin - function vadd_kernel(a::cuTile.TileArray{Float32,1}, - b::cuTile.TileArray{Float32,1}, - c::cuTile.TileArray{Float32,1}) - pid = cuTile.bid(1) - ta = cuTile.load(a, pid, (16,)) - tb = cuTile.load(b, pid, (16,)) - cuTile.store(c, pid, ta + tb) - return nothing - end - - n = 1024 - a = CUDA.ones(Float32, n) - b = CUDA.ones(Float32, n) .* 2 - c = CUDA.zeros(Float32, n) - - @testset "launch with num_ctas" begin - cuTile.launch(vadd_kernel, 64, a, b, c; num_ctas=2) - CUDA.synchronize() - @test Array(c) ≈ ones(Float32, n) .* 3 - end - - @testset "launch with occupancy" begin - fill!(c, 0.0f0) - cuTile.launch(vadd_kernel, 64, a, b, c; occupancy=4) - CUDA.synchronize() - @test Array(c) ≈ ones(Float32, n) .* 3 - end - - @testset "launch with both hints" begin - fill!(c, 0.0f0) - cuTile.launch(vadd_kernel, 64, a, b, c; num_ctas=4, occupancy=8) - CUDA.synchronize() - @test Array(c) ≈ ones(Float32, n) .* 3 - end - end - end - -end diff --git a/test/execution.jl b/test/execution.jl index 0edb38f..44e5114 100644 --- a/test/execution.jl +++ b/test/execution.jl @@ -1589,3 +1589,70 @@ end end end + +@testset "Entry Hints Integration" begin + +@testset "launch with num_ctas" begin + function vadd_kernel_num_ctas(a::ct.TileArray{Float32,1}, + b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + tb = ct.load(b, pid, (16,)) + ct.store(c, pid, ta + tb) + return nothing + end + + n = 1024 + a = CUDA.ones(Float32, n) + b = CUDA.ones(Float32, n) .* 2 + c = CUDA.zeros(Float32, n) + + ct.launch(vadd_kernel_num_ctas, 64, a, b, c; num_ctas=2) + CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 +end + +@testset "launch with occupancy" begin + function vadd_kernel_occupancy(a::ct.TileArray{Float32,1}, + b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + tb = ct.load(b, pid, (16,)) + ct.store(c, pid, ta + tb) + return nothing + end + + n = 1024 + a = CUDA.ones(Float32, n) + b = CUDA.ones(Float32, n) .* 2 + c = CUDA.zeros(Float32, n) + + ct.launch(vadd_kernel_occupancy, 64, a, b, c; occupancy=4) + CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 +end + +@testset "launch with both hints" begin + function vadd_kernel_both_hints(a::ct.TileArray{Float32,1}, + b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + tb = ct.load(b, pid, (16,)) + ct.store(c, pid, ta + tb) + return nothing + end + + n = 1024 + a = CUDA.ones(Float32, n) + b = CUDA.ones(Float32, n) .* 2 + c = CUDA.zeros(Float32, n) + + ct.launch(vadd_kernel_both_hints, 64, a, b, c; num_ctas=4, occupancy=8) + CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 +end + +end