From 5a603b2a7f9d97b5c0abda98774eef71c571ce80 Mon Sep 17 00:00:00 2001 From: AntonOresten Date: Wed, 14 Jan 2026 01:11:14 +0100 Subject: [PATCH 1/2] expose load/store/gather/scatter optimization hints --- src/bytecode/encodings.jl | 16 +- src/bytecode/writer.jl | 76 ++++++- src/compiler/codegen/kernel.jl | 2 +- src/compiler/intrinsics.jl | 8 + src/compiler/intrinsics/memory.jl | 62 ++++-- src/compiler/intrinsics/views.jl | 69 ++++-- src/compiler/target.jl | 6 +- src/language/operations.jl | 112 +++++++--- test/codegen.jl | 160 +++++++++++++- test/execution.jl | 345 +++++++++++++++++++++++++++++- 10 files changed, 765 insertions(+), 91 deletions(-) diff --git a/src/bytecode/encodings.jl b/src/bytecode/encodings.jl index 9305bee..9f06415 100644 --- a/src/bytecode/encodings.jl +++ b/src/bytecode/encodings.jl @@ -423,7 +423,7 @@ function encode_LoadViewTkoOp!(cb::CodeBuilder, token::Union{Value, Nothing}=nothing, memory_ordering::MemoryOrderingSemantics=MemoryWeak, memory_scope::Union{MemoryScope, Nothing}=nothing, - optimization_hints::Union{Vector{UInt8}, Nothing}=nothing) + optimization_hints::Union{OptimizationHints, Nothing}=nothing) encode_varint!(cb.buf, Opcode.LoadViewTkoOp) # Variadic result types encode_typeid_seq!(cb.buf, [tile_type, token_type]) @@ -447,7 +447,7 @@ function encode_LoadViewTkoOp!(cb::CodeBuilder, encode_enum!(cb.buf, memory_scope) end if optimization_hints !== nothing - append!(cb.buf, optimization_hints) + encode_opattr_optimization_hints!(cb, optimization_hints) end # Operands @@ -472,7 +472,7 @@ function encode_StoreViewTkoOp!(cb::CodeBuilder, token::Union{Value, Nothing}=nothing, memory_ordering::MemoryOrderingSemantics=MemoryWeak, memory_scope::Union{MemoryScope, Nothing}=nothing, - optimization_hints::Union{Vector{UInt8}, Nothing}=nothing) + optimization_hints::Union{OptimizationHints, Nothing}=nothing) encode_varint!(cb.buf, Opcode.StoreViewTkoOp) # Variadic result types (just token) encode_typeid_seq!(cb.buf, [token_type]) @@ -496,7 
+496,7 @@ function encode_StoreViewTkoOp!(cb::CodeBuilder, encode_enum!(cb.buf, memory_scope) end if optimization_hints !== nothing - append!(cb.buf, optimization_hints) + encode_opattr_optimization_hints!(cb, optimization_hints) end # Operands @@ -541,7 +541,7 @@ function encode_LoadPtrTkoOp!(cb::CodeBuilder, token::Union{Value, Nothing}=nothing, memory_ordering::MemoryOrderingSemantics=MemoryWeak, memory_scope::Union{MemoryScope, Nothing}=nothing, - optimization_hints::Union{Vector{UInt8}, Nothing}=nothing) + optimization_hints::Union{OptimizationHints, Nothing}=nothing) encode_varint!(cb.buf, Opcode.LoadPtrTkoOp) # Result types encode_typeid!(cb.buf, result_type) @@ -572,7 +572,7 @@ function encode_LoadPtrTkoOp!(cb::CodeBuilder, encode_enum!(cb.buf, memory_scope) end if optimization_hints !== nothing - append!(cb.buf, optimization_hints) + encode_opattr_optimization_hints!(cb, optimization_hints) end # Operands @@ -600,7 +600,7 @@ function encode_StorePtrTkoOp!(cb::CodeBuilder, token::Union{Value, Nothing}=nothing, memory_ordering::MemoryOrderingSemantics=MemoryWeak, memory_scope::Union{MemoryScope, Nothing}=nothing, - optimization_hints::Union{Vector{UInt8}, Nothing}=nothing) + optimization_hints::Union{OptimizationHints, Nothing}=nothing) encode_varint!(cb.buf, Opcode.StorePtrTkoOp) # Result type (token) encode_typeid!(cb.buf, token_type) @@ -627,7 +627,7 @@ function encode_StorePtrTkoOp!(cb::CodeBuilder, encode_enum!(cb.buf, memory_scope) end if optimization_hints !== nothing - append!(cb.buf, optimization_hints) + encode_opattr_optimization_hints!(cb, optimization_hints) end # Operands diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl index bd2bc5d..67c3df1 100644 --- a/src/bytecode/writer.jl +++ b/src/bytecode/writer.jl @@ -544,9 +544,75 @@ function finalize_function!(func_buf::Vector{UInt8}, cb::CodeBuilder, end #============================================================================= - EntryHints: Kernel-level compilation hints + 
+ Optimization and Entry Hints =============================================================================# +""" + encode_tagged_value!(buf, type_table, value) + +Encode a value with its type tag. +""" +function encode_tagged_value!(buf::Vector{UInt8}, type_table::TypeTable, value::Bool) + push!(buf, AttributeTag.Bool) + push!(buf, value) +end + +function encode_tagged_value!(buf::Vector{UInt8}, type_table::TypeTable, value::Integer) + push!(buf, AttributeTag.Integer) + encode_typeid!(buf, I32(type_table)) + encode_varint!(buf, UInt32(value)) +end + +""" +Optimization hints for load/store operations. +- `latency`: Optional latency hint (1-10), or nothing for default +- `allow_tma`: Whether TMA (Tensor Memory Accelerator) is allowed (default: true) +""" +@kwdef struct LoadStoreHints + latency::Union{Int, Nothing} = nothing + allow_tma::Bool = true +end + +""" +Per-architecture optimization hints for load/store operations. +- `hints_by_arch`: List of (SM architecture, load/store hints) pairs +""" +struct OptimizationHints + hints_by_arch::Vector{Tuple{String, LoadStoreHints}} +end + +function make_load_store_hints(sm_arch::Union{String, Nothing}, hints::LoadStoreHints) + isnothing(sm_arch) && throw(ArgumentError("sm_arch must be explicitly passed when load/store hints are present")) + OptimizationHints([(sm_arch, hints)]) +end + +function encode_opattr_optimization_hints!(cb::CodeBuilder, hints::OptimizationHints) + # Outer dictionary: arch -> hints_dict + encode_varint!(cb.buf, length(hints.hints_by_arch)) + for (arch, load_store_hints) in hints.hints_by_arch + arch_id = cb.string_table[arch] + encode_varint!(cb.buf, arch_id.id) + # Encode hints as inner dictionary (tagged) + encode_load_store_hints_dict!(cb, load_store_hints) + end +end + +function encode_load_store_hints_dict!(cb::CodeBuilder, hints::LoadStoreHints) + # Build list of (key, value) pairs for non-default hints + items = Tuple{String, Any}[] + hints.allow_tma || push!(items, ("allow_tma", false)) + isnothing(hints.latency) ||
push!(items, ("latency", hints.latency)) + + # Encode dictionary + push!(cb.buf, AttributeTag.Dictionary) + encode_varint!(cb.buf, length(items)) + for (key, value) in items + key_id = cb.string_table[key] + encode_varint!(cb.buf, key_id.id) + encode_tagged_value!(cb.buf, cb.type_table, value) + end +end + """ Kernel-level compilation hints (num_ctas, occupancy). Encoded as a dictionary attribute in bytecode. @@ -567,10 +633,6 @@ function validate_occupancy(occupancy::Union{Int, Nothing}) 1 <= occupancy <= 32 || throw(ArgumentError("occupancy must be between 1 and 32, got $occupancy")) end -""" -Encode EntryHints as OptimizationHints format. -Returns raw bytes for entry_hints parameter or nothing. -""" function encode_entry_hints(writer::BytecodeWriter, sm_arch::Union{String, Nothing}, hints::EntryHints) validate_num_ctas(hints.num_ctas) validate_occupancy(hints.occupancy) @@ -603,9 +665,7 @@ function encode_entry_hints(writer::BytecodeWriter, sm_arch::Union{String, Nothi for (key, value) in items key_id = writer.string_table[key] encode_varint!(buf, key_id.id) - push!(buf, AttributeTag.Integer) - encode_typeid!(buf, I32(writer.type_table)) - encode_varint!(buf, UInt32(value)) + encode_tagged_value!(buf, writer.type_table, value) end return buf diff --git a/src/compiler/codegen/kernel.jl b/src/compiler/codegen/kernel.jl index 71a4f5c..81492b1 100644 --- a/src/compiler/codegen/kernel.jl +++ b/src/compiler/codegen/kernel.jl @@ -12,7 +12,7 @@ function emit_kernel!(writer::BytecodeWriter, func_buf::Vector{UInt8}, is_entry::Bool = true, num_ctas::Union{Int, Nothing} = nothing, occupancy::Union{Int, Nothing} = nothing) - ctx = CGCtx(writer, target) + ctx = CGCtx(writer, target, sm_arch) tt = ctx.tt # Validate non-ghost argument types are concrete diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl index 06a8f40..16c55da 100644 --- a/src/compiler/intrinsics.jl +++ b/src/compiler/intrinsics.jl @@ -21,6 +21,14 @@ end emit_intrinsic!(ctx::CGCtx, 
@nospecialize(func), args) = missing +# Shared helper for creating load/store optimization hints +function create_optimization_hints(ctx::CGCtx, latency::Union{Int, Nothing}, allow_tma::Bool=true) + isnothing(latency) && allow_tma && return nothing + isnothing(latency) || 1 <= latency <= 10 || error("latency must be between 1 and 10, got $latency") + hints = LoadStoreHints(; latency, allow_tma) + return make_load_store_hints(ctx.sm_arch, hints) +end + include("intrinsics/core.jl") include("intrinsics/conversions.jl") include("intrinsics/arithmetic.jl") diff --git a/src/compiler/intrinsics/memory.jl b/src/compiler/intrinsics/memory.jl index 4e7806d..1d7648a 100644 --- a/src/compiler/intrinsics/memory.jl +++ b/src/compiler/intrinsics/memory.jl @@ -5,16 +5,20 @@ # cuda_tile.load_ptr_tko @eval Intrinsics begin """ - load_ptr_tko(ptrs, mask=nothing, padding=nothing) + load_ptr_tko(ptrs, latency=nothing, mask=nothing, padding=nothing) Load values from a tile of pointers. If mask is provided, masked-out positions return the padding value. Compiled to cuda_tile.load_ptr_tko. + + Note: TMA (allow_tma) is not applicable for pointer-based loads as they + support irregular access patterns incompatible with TMA requirements. """ @noinline function load_ptr_tko(ptrs::Tile{Ptr{T}, S}, + latency::Union{Int, Nothing}=nothing, mask::Union{Tile{Bool, S}, Nothing}=nothing, padding::Union{Tile{T, S}, Nothing}=nothing) where {T, S} - donotdelete(ptrs, mask, padding) + donotdelete(ptrs, latency, mask, padding) Tile{T, S}() end end @@ -22,6 +26,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.load_ptr_tko), args) cb = ctx.cb tt = ctx.tt + # args: (ptrs, latency, mask?, padding?)
# Get pointer tile (arg 1) ptrs_tv = emit_value!(ctx, args[1]) ptrs_tv === nothing && error("load_ptr_tko: cannot resolve pointer tile") @@ -36,17 +41,23 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.load_ptr_tko), args) result_tile_type = tile_type!(tt, dtype, tile_shape) token_type = Token(tt) - # Check if mask is provided (arg 2 is not nothing) - has_mask = length(args) >= 2 && get_constant(ctx, args[2]) !== nothing + # Extract latency hint (args[2]) + latency = get_constant(ctx, args[2]) + + # Create optimization hints if provided + optimization_hints = create_optimization_hints(ctx, latency) + + # Check if mask is provided (arg 3 is not nothing) + has_mask = length(args) >= 3 && get_constant(ctx, args[3]) !== nothing if has_mask - # Get mask tile (arg 2) - mask_tv = emit_value!(ctx, args[2]) + # Get mask tile (arg 3) + mask_tv = emit_value!(ctx, args[3]) mask_tv === nothing && error("load_ptr_tko: cannot resolve mask tile") mask = mask_tv.v - # Get padding tile (arg 3) - padding_tv = emit_value!(ctx, args[3]) + # Get padding tile (arg 4) + padding_tv = emit_value!(ctx, args[4]) padding_tv === nothing && error("load_ptr_tko: cannot resolve padding tile") padding = padding_tv.v @@ -54,11 +65,13 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.load_ptr_tko), args) tile_val, new_token = encode_LoadPtrTkoOp!(cb, result_tile_type, token_type, pointers; mask=mask, padding_value=padding, - token=ctx.token) + token=ctx.token, + optimization_hints) else # Load without mask tile_val, new_token = encode_LoadPtrTkoOp!(cb, result_tile_type, token_type, pointers; - token=ctx.token) + token=ctx.token, + optimization_hints) end ctx.token = new_token @@ -71,15 +84,19 @@ end # cuda_tile.store_ptr_tko @eval Intrinsics begin """ - store_ptr_tko(ptrs, values, mask=nothing) + store_ptr_tko(ptrs, values, latency, mask=nothing) Store values to a tile of pointers. If mask is provided, masked-out positions are not written. Compiled to cuda_tile.store_ptr_tko. 
+ + Note: TMA (allow_tma) is not applicable for pointer-based stores as they + support irregular access patterns incompatible with TMA requirements. """ @noinline function store_ptr_tko(ptrs::Tile{Ptr{T}, S}, values::Tile{T, S}, + latency::Union{Int, Nothing}, mask::Union{Tile{Bool, S}, Nothing}=nothing) where {T, S} - donotdelete(ptrs, values, mask) + donotdelete(ptrs, values, latency, mask) nothing end end @@ -87,6 +104,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.store_ptr_tko), args) cb = ctx.cb tt = ctx.tt + # args: (ptrs, values, latency, mask?) # Get pointer tile (arg 1) ptrs_tv = emit_value!(ctx, args[1]) ptrs_tv === nothing && error("store_ptr_tko: cannot resolve pointer tile") @@ -99,23 +117,31 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.store_ptr_tko), args) token_type = Token(tt) - # Check if mask is provided (arg 3 is not nothing) - has_mask = length(args) >= 3 && get_constant(ctx, args[3]) !== nothing + # Extract latency hint (args[3]) + latency = get_constant(ctx, args[3]) + + # Create optimization hints if provided + optimization_hints = create_optimization_hints(ctx, latency) + + # Check if mask is provided (arg 4 is not nothing) + has_mask = length(args) >= 4 && get_constant(ctx, args[4]) !== nothing if has_mask - # Get mask tile (arg 3) - mask_tv = emit_value!(ctx, args[3]) + # Get mask tile (arg 4) + mask_tv = emit_value!(ctx, args[4]) mask_tv === nothing && error("store_ptr_tko: cannot resolve mask tile") mask = mask_tv.v # Store with mask new_token = encode_StorePtrTkoOp!(cb, token_type, pointers, values; mask=mask, - token=ctx.token) + token=ctx.token, + optimization_hints) else # Store without mask new_token = encode_StorePtrTkoOp!(cb, token_type, pointers, values; - token=ctx.token) + token=ctx.token, + optimization_hints) end ctx.token = new_token diff --git a/src/compiler/intrinsics/views.jl b/src/compiler/intrinsics/views.jl index 2773a30..0690bc9 100644 --- a/src/compiler/intrinsics/views.jl +++ 
b/src/compiler/intrinsics/views.jl @@ -1,8 +1,5 @@ # views -""" -Convert integer padding mode value to bytecode PaddingValue enum. -""" function padding_mode_to_padding_value(mode::Int) mode == 0 ? PaddingMissing : mode == 1 ? PaddingZero : @@ -83,13 +80,16 @@ end # cuda_tile.load_view_tko @eval Intrinsics begin """ - load_partition_view(pv::PartitionView, index...) -> Tile + load_partition_view(pv::PartitionView, latency, allow_tma, index...) -> Tile Load a tile from a partition view at the given 0-indexed tile coordinates. Compiled to cuda_tile.load_view_tko. """ - @noinline function load_partition_view(pv::PartitionView{T, N, Shape}, index::Vararg{Integer}) where {T, N, Shape} - donotdelete(pv) + @noinline function load_partition_view(pv::PartitionView{T, N, Shape}, + latency::Union{Int, Nothing}, + allow_tma::Bool, + index::Vararg{Integer}) where {T, N, Shape} + donotdelete(pv, latency, allow_tma) Tile{T, Shape}() end end @@ -97,7 +97,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.load_partition_view), a cb = ctx.cb tt = ctx.tt - # args: (partition_view, indices...) + # args: (partition_view, latency, allow_tma, indices...) 
pv_arg = emit_value!(ctx, args[1]) pv_arg === nothing && error("load_partition_view() requires a PartitionView argument") pv_arg.v === nothing && error("load_partition_view() requires a materialized PartitionView") @@ -115,10 +115,21 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.load_partition_view), a tile_type = tile_type!(tt, dtype, tile_shape) token_type = Token(tt) - # Extract indices from args[2:end] and infer index type + # Extract optimization hints (args[2] = latency, args[3] = allow_tma) + latency = get_constant(ctx, args[2]) + allow_tma = get_constant(ctx, args[3]) + + # Verify we got compile-time constants + if latency === nothing && allow_tma === nothing + error("load_partition_view(): latency and allow_tma must be compile-time constants") + end + # allow_tma defaults to true if not provided + allow_tma_val = allow_tma === nothing ? true : allow_tma::Bool + + # Extract indices from args[4:end] and infer index type index_vals = Value[] index_jl_types = Type[] - for i in 2:length(args) + for i in 4:length(args) tv = emit_value!(ctx, args[i]) tv === nothing && error("load_partition_view(): cannot resolve index argument") push!(index_vals, tv.v) @@ -133,8 +144,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.load_partition_view), a # Pad indices if needed index_vals = pad_indices(ctx, index_vals, ndim, index_type, index_jl_type) + # Create optimization hints if provided + optimization_hints = create_optimization_hints(ctx, latency, allow_tma_val) + # Load tile with token - tile_val, new_token = encode_LoadViewTkoOp!(cb, tile_type, token_type, pv_arg.v, index_vals; token=ctx.token) + tile_val, new_token = encode_LoadViewTkoOp!(cb, tile_type, token_type, pv_arg.v, index_vals; + token=ctx.token, optimization_hints) ctx.token = new_token CGVal(tile_val, tile_type, Tile{elem_type, Tuple(tile_shape)}, tile_shape) @@ -327,14 +342,17 @@ end # cuda_tile.store_view_tko @eval Intrinsics begin """ - store_partition_view(pv::PartitionView, 
tile, index...) -> Nothing + store_partition_view(pv::PartitionView, tile, latency, allow_tma, index...) -> Nothing Store a tile to a partition view at the given 0-indexed tile coordinates. Compiled to cuda_tile.store_view_tko. """ - @noinline function store_partition_view(pv::PartitionView{T, N, Shape}, tile::Tile{T, Shape}, index::Vararg{Integer}) where {T, N, Shape} - donotdelete(pv) - donotdelete(tile) + @noinline function store_partition_view(pv::PartitionView{T, N, Shape}, + tile::Tile{T, Shape}, + latency::Union{Int, Nothing}, + allow_tma::Bool, + index::Vararg{Integer}) where {T, N, Shape} + donotdelete(pv, tile, latency, allow_tma) nothing end end @@ -342,7 +360,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.store_partition_view), cb = ctx.cb tt = ctx.tt - # args: (partition_view, tile, indices...) + # args: (partition_view, tile, latency, allow_tma, indices...) pv_arg = emit_value!(ctx, args[1]) pv_arg === nothing && error("store_partition_view() requires a PartitionView argument") pv_arg.v === nothing && error("store_partition_view() requires a materialized PartitionView") @@ -371,10 +389,21 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.store_partition_view), tile_val = encode_ReshapeOp!(cb, tile_1d_type, tile_val) end - # Extract indices from args[3:end] and infer index type + # Extract optimization hints (args[3] = latency, args[4] = allow_tma) + latency = get_constant(ctx, args[3]) + allow_tma = get_constant(ctx, args[4]) + + # Verify we got compile-time constants + if latency === nothing && allow_tma === nothing + error("store_partition_view(): latency and allow_tma must be compile-time constants") + end + # allow_tma defaults to true if not provided + allow_tma_val = allow_tma === nothing ? 
true : allow_tma::Bool + + # Extract indices from args[5:end] and infer index type index_vals = Value[] index_jl_types = Type[] - for i in 3:length(args) + for i in 5:length(args) tv = emit_value!(ctx, args[i]) tv === nothing && error("store_partition_view(): cannot resolve index argument") push!(index_vals, tv.v) @@ -389,9 +418,13 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.store_partition_view), # Pad indices if needed index_vals = pad_indices(ctx, index_vals, actual_ndim, index_type, index_jl_type) + # Create optimization hints if provided + optimization_hints = create_optimization_hints(ctx, latency, allow_tma_val) + # Store tile with token token_type = Token(tt) - new_token = encode_StoreViewTkoOp!(cb, token_type, tile_val, pv_arg.v, index_vals; token=ctx.token) + new_token = encode_StoreViewTkoOp!(cb, token_type, tile_val, pv_arg.v, index_vals; + token=ctx.token, optimization_hints) ctx.token = new_token nothing diff --git a/src/compiler/target.jl b/src/compiler/target.jl index cc569b7..af26ce7 100644 --- a/src/compiler/target.jl +++ b/src/compiler/target.jl @@ -150,9 +150,12 @@ mutable struct CGCtx # Type cache: Julia type -> TypeId type_cache::Dict{Type, TypeId} + + # Target architecture string (e.g., "sm_100"), or nothing if unspecified + sm_arch::Union{String, Nothing} end -function CGCtx(writer::BytecodeWriter, target::TileTarget) +function CGCtx(writer::BytecodeWriter, target::TileTarget, sm_arch::Union{String, Nothing}=nothing) CGCtx( Dict{Int, CGVal}(), Dict{Int, CGVal}(), @@ -167,6 +170,7 @@ function CGCtx(writer::BytecodeWriter, target::TileTarget) nothing, nothing, Dict{Type, TypeId}(), + sm_arch, ) end diff --git a/src/language/operations.jl b/src/language/operations.jl index bf20bb2..ace4b7a 100644 --- a/src/language/operations.jl +++ b/src/language/operations.jl @@ -60,7 +60,7 @@ Axis is 1-indexed. Equivalent to cld(arr.sizes[axis], shape[axis]).
end """ - load(arr::TileArray, index, shape; padding_mode=PaddingMode.Undetermined) -> Tile + load(arr::TileArray, index, shape; padding_mode=PaddingMode.Undetermined, latency=nothing, allow_tma=true) -> Tile Load a tile from a TileArray at the given index with the specified shape. Index is 1-indexed. Shape must be compile-time constant. @@ -73,101 +73,131 @@ Index is 1-indexed. Shape must be compile-time constant. - `PaddingMode.PosInf`: Return positive infinity for OOB elements - `PaddingMode.NegInf`: Return negative infinity for OOB elements +# Optimization Hints +- `latency`: Optional latency hint (1-10), or nothing for compiler default +- `allow_tma`: Whether TMA (Tensor Memory Accelerator) is allowed (default: true) + # Example ```julia -tile = ct.load(arr, (bid,), (TILE_N[],); padding_mode=ct.PaddingMode.Zero) +tile = ct.load(arr, (bid,), (TILE_N[],); padding_mode=ct.PaddingMode.Zero, latency=3) ``` """ @inline function load(arr::TileArray{T, N}, index, shape::NTuple{<:Any, Int}; - padding_mode::Int=PaddingMode.Undetermined) where {T, N} + padding_mode::Int=PaddingMode.Undetermined, + latency::Union{Int, Nothing}=nothing, + allow_tma::Bool=true) where {T, N} tv = Intrinsics.make_tensor_view(arr) pv = Intrinsics.make_partition_view(tv, Val(shape), padding_mode) - Intrinsics.load_partition_view(pv, (promote(index...) .- One())...) + Intrinsics.load_partition_view(pv, latency, allow_tma, (promote(index...) .- One())...) 
end @inline function load(arr::TileArray{T, N}, index::Integer, shape::NTuple{<:Any, Int}; - padding_mode::Int=PaddingMode.Undetermined) where {T, N} + padding_mode::Int=PaddingMode.Undetermined, + latency::Union{Int, Nothing}=nothing, + allow_tma::Bool=true) where {T, N} tv = Intrinsics.make_tensor_view(arr) pv = Intrinsics.make_partition_view(tv, Val(shape), padding_mode) - Intrinsics.load_partition_view(pv, index - One()) + Intrinsics.load_partition_view(pv, latency, allow_tma, index - One()) end # Load with Constant shape tuple @inline function load(arr::TileArray{T, N}, index, shape::Tuple{Vararg{Constant{Int}}}; - padding_mode::Int=PaddingMode.Undetermined) where {T, N} + padding_mode::Int=PaddingMode.Undetermined, + latency::Union{Int, Nothing}=nothing, + allow_tma::Bool=true) where {T, N} shape_val = _extract_shape(shape) tv = Intrinsics.make_tensor_view(arr) pv = Intrinsics.make_partition_view(tv, Val(shape_val), padding_mode) - Intrinsics.load_partition_view(pv, (promote(index...) .- One())...) + Intrinsics.load_partition_view(pv, latency, allow_tma, (promote(index...) .- One())...) end # Keyword argument version @inline function load(arr::TileArray{T, N}; index, shape, - padding_mode::Int=PaddingMode.Undetermined) where {T, N} + padding_mode::Int=PaddingMode.Undetermined, + latency::Union{Int, Nothing}=nothing, + allow_tma::Bool=true) where {T, N} shape_val = _extract_shape(shape) tv = Intrinsics.make_tensor_view(arr) pv = Intrinsics.make_partition_view(tv, Val(shape_val), padding_mode) - Intrinsics.load_partition_view(pv, (promote(index...) .- One())...) + Intrinsics.load_partition_view(pv, latency, allow_tma, (promote(index...) .- One())...) end """ - store(arr::TileArray, index, tile::Tile) -> Tile + store(arr::TileArray, index, tile::Tile; latency=nothing, allow_tma=true) -> Tile Store a tile to a TileArray at the given index. Index is 1-indexed. Returns the stored tile (enables chaining and helps constant folding). 
+ +# Optimization Hints +- `latency`: Optional latency hint (1-10), or nothing for compiler default +- `allow_tma`: Whether TMA (Tensor Memory Accelerator) is allowed (default: true) """ # Regular N-D tiles (N >= 1) -@inline function store(arr::TileArray{T}, index, tile::Tile{T, Shape}) where {T, Shape} +@inline function store(arr::TileArray{T}, index, tile::Tile{T, Shape}; + latency::Union{Int, Nothing}=nothing, + allow_tma::Bool=true) where {T, Shape} tv = Intrinsics.make_tensor_view(arr) pv = Intrinsics.make_partition_view(tv, Val(Shape), PaddingMode.Undetermined) - Intrinsics.store_partition_view(pv, tile, (promote(index...) .- One())...) + Intrinsics.store_partition_view(pv, tile, latency, allow_tma, (promote(index...) .- One())...) return tile # XXX: enables constant folding; remove when possible (see "constant folding" test) end -@inline function store(arr::TileArray{T}, index::Integer, tile::Tile{T, Shape}) where {T, Shape} +@inline function store(arr::TileArray{T}, index::Integer, tile::Tile{T, Shape}; + latency::Union{Int, Nothing}=nothing, + allow_tma::Bool=true) where {T, Shape} tv = Intrinsics.make_tensor_view(arr) pv = Intrinsics.make_partition_view(tv, Val(Shape), PaddingMode.Undetermined) - Intrinsics.store_partition_view(pv, tile, index - One()) + Intrinsics.store_partition_view(pv, tile, latency, allow_tma, index - One()) return tile # XXX: enables constant folding; remove when possible (see "constant folding" test) end # Special case for 0D (scalar) tiles - reshape to 1D for partition view -@inline function store(arr::TileArray{T}, index, tile::Tile{T, ()}) where {T} +@inline function store(arr::TileArray{T}, index, tile::Tile{T, ()}; + latency::Union{Int, Nothing}=nothing, + allow_tma::Bool=true) where {T} tv = Intrinsics.make_tensor_view(arr) # Reshape 0D tile to 1D (partition views require at least 1D) tile_1d = Intrinsics.reshape(tile, Val((1,))) pv = Intrinsics.make_partition_view(tv, Val((1,)), PaddingMode.Undetermined) - 
Intrinsics.store_partition_view(pv, tile_1d, (promote(index...) .- One())...) + Intrinsics.store_partition_view(pv, tile_1d, latency, allow_tma, (promote(index...) .- One())...) return tile # XXX: enables constant folding; remove when possible (see "constant folding" test) end -@inline function store(arr::TileArray{T}, index::Integer, tile::Tile{T, ()}) where {T} +@inline function store(arr::TileArray{T}, index::Integer, tile::Tile{T, ()}; + latency::Union{Int, Nothing}=nothing, + allow_tma::Bool=true) where {T} tv = Intrinsics.make_tensor_view(arr) tile_1d = Intrinsics.reshape(tile, Val((1,))) pv = Intrinsics.make_partition_view(tv, Val((1,)), PaddingMode.Undetermined) - Intrinsics.store_partition_view(pv, tile_1d, index - One()) + Intrinsics.store_partition_view(pv, tile_1d, latency, allow_tma, index - One()) return tile # XXX: enables constant folding; remove when possible (see "constant folding" test) end # Keyword argument version - dispatch to positional version -@inline function store(arr::TileArray{T}; index, tile::Tile{T, Shape}) where {T, Shape} - store(arr, index, tile) +@inline function store(arr::TileArray{T}; index, tile::Tile{T, Shape}, + latency::Union{Int, Nothing}=nothing, + allow_tma::Bool=true) where {T, Shape} + store(arr, index, tile; latency, allow_tma) end """ - gather(array::TileArray{T, 1}, indices::Tile{I, S}) -> Tile{T, S} + gather(array::TileArray{T, 1}, indices::Tile{I, S}; latency=nothing) -> Tile{T, S} Gather elements from a 1D array using index tile. Indices are 1-indexed. Out-of-bounds indices return zero. 
+# Optimization Hints +- `latency`: Optional latency hint (1-10), or nothing for compiler default + # Example ```julia base = (bid - 1) * TILE indices = base .+ ct.arange((TILE,), Int32) -tile = ct.gather(arr, indices) +tile = ct.gather(arr, indices; latency=3) ``` """ -@inline function gather(array::TileArray{T, 1}, indices::Tile{I, S}) where {T, I <: Integer, S} +@inline function gather(array::TileArray{T, 1}, indices::Tile{I, S}; + latency::Union{Int, Nothing}=nothing) where {T, I <: Integer, S} # Convert to 0-indexed indices_0 = indices .- one(I) @@ -187,16 +217,20 @@ tile = ct.gather(arr, indices) # Padding for OOB (zero) padding = broadcast_to(Tile(zero(T)), S) - Intrinsics.load_ptr_tko(ptr_tile, mask, padding) + Intrinsics.load_ptr_tko(ptr_tile, latency, mask, padding) end """ - gather(array::TileArray{T, 2}, indices::Tuple{Tile, Tile}) -> Tile{T, S} + gather(array::TileArray{T, 2}, indices::Tuple{Tile, Tile}; latency=nothing) -> Tile{T, S} Gather elements from a 2D array using a tuple of index tiles. Indices are 1-indexed. Index tiles are broadcast to a common shape. + +# Optimization Hints +- `latency`: Optional latency hint (1-10), or nothing for compiler default """ -@inline function gather(array::TileArray{T, 2}, indices::Tuple{Tile{I0, S0}, Tile{I1, S1}}) where {T, I0 <: Integer, I1 <: Integer, S0, S1} +@inline function gather(array::TileArray{T, 2}, indices::Tuple{Tile{I0, S0}, Tile{I1, S1}}; + latency::Union{Int, Nothing}=nothing) where {T, I0 <: Integer, I1 <: Integer, S0, S1} # Convert to 0-indexed idx0_0 = indices[1] .- one(I0) idx1_0 = indices[2] .- one(I1) @@ -235,23 +269,27 @@ Indices are 1-indexed. Index tiles are broadcast to a common shape. 
# Padding for OOB (zero) padding = broadcast_to(Tile(zero(T)), S) - Intrinsics.load_ptr_tko(ptr_tile, mask, padding) + Intrinsics.load_ptr_tko(ptr_tile, latency, mask, padding) end """ - scatter(array::TileArray{T, 1}, indices::Tile{I, S}, tile::Tile{T, S}) -> Nothing + scatter(array::TileArray{T, 1}, indices::Tile{I, S}, tile::Tile{T, S}; latency=nothing) -> Nothing Scatter elements to a 1D array at index tile positions. Indices are 1-indexed. Out-of-bounds indices are ignored. +# Optimization Hints +- `latency`: Optional latency hint (1-10), or nothing for compiler default + # Example ```julia base = (bid - 1) * TILE indices = base .+ ct.arange((TILE,), Int32) -ct.scatter(arr, indices, result_tile) +ct.scatter(arr, indices, result_tile; latency=3) ``` """ -@inline function scatter(array::TileArray{T, 1}, indices::Tile{I, S}, tile::Tile{T, S}) where {T, I <: Integer, S} +@inline function scatter(array::TileArray{T, 1}, indices::Tile{I, S}, tile::Tile{T, S}; + latency::Union{Int, Nothing}=nothing) where {T, I <: Integer, S} # Convert to 0-indexed indices_0 = indices .- one(I) @@ -268,16 +306,20 @@ ct.scatter(arr, indices, result_tile) lt_size = indices_i32 .< size_0d mask = ge_zero .& lt_size - Intrinsics.store_ptr_tko(ptr_tile, tile, mask) + Intrinsics.store_ptr_tko(ptr_tile, tile, latency, mask) end """ - scatter(array::TileArray{T, 2}, indices::Tuple{Tile, Tile}, tile::Tile) -> Nothing + scatter(array::TileArray{T, 2}, indices::Tuple{Tile, Tile}, tile::Tile; latency=nothing) -> Nothing Scatter elements to a 2D array at index tile positions. Indices are 1-indexed. Index tiles and value tile must broadcast to same shape. 
+
+# Optimization Hints
+- `latency`: Optional latency hint (1-10), or nothing for compiler default
 """
-@inline function scatter(array::TileArray{T, 2}, indices::Tuple{Tile{I0, S0}, Tile{I1, S1}}, tile::Tile{T, Stile}) where {T, I0 <: Integer, I1 <: Integer, S0, S1, Stile}
+@inline function scatter(array::TileArray{T, 2}, indices::Tuple{Tile{I0, S0}, Tile{I1, S1}}, tile::Tile{T, Stile};
+                         latency::Union{Int, Nothing}=nothing) where {T, I0 <: Integer, I1 <: Integer, S0, S1, Stile}
     # Convert to 0-indexed
     idx0_0 = indices[1] .- one(I0)
     idx1_0 = indices[2] .- one(I1)
@@ -314,7 +356,7 @@ Indices are 1-indexed. Index tiles and value tile must broadcast to same shape.
     mask1 = (idx1_i32 .>= zero_bc) .& (idx1_i32 .< size1_bc)
     mask = mask0 .& mask1

-    Intrinsics.store_ptr_tko(ptr_tile, tile_bc, mask)
+    Intrinsics.store_ptr_tko(ptr_tile, tile_bc, latency, mask)
 end

 #=============================================================================
diff --git a/test/codegen.jl b/test/codegen.jl
index 7e04e27..ae4b42e 100644
--- a/test/codegen.jl
+++ b/test/codegen.jl
@@ -1848,7 +1848,7 @@ end
 end

 #=============================================================================
- Entry Hints (optimization_hints attribute)
+ Entry Hints (kernel-level optimization hints)
 =============================================================================#

 @testset "Entry Hints" begin
@@ -1961,3 +1961,161 @@ end
         @test !isempty(bytecode32)
     end
 end
+
+#=============================================================================
+ Load / Store Hints (operation-level optimization hints)
+=============================================================================#
+
+@testset "Load / Store Optimization Hints" begin
+    # 1D ArraySpec shared by every test in this testset
+    spec1d = ct.ArraySpec{1}(16, true)
+
+    @testset "latency only on load" begin
+        @test @filecheck begin
+            @check "load_view_tko"
+            @check "optimization_hints = "
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a
+                pid = ct.bid(1)
+                t = ct.load(a, pid, (16,); latency=5)
+                ct.store(a, pid, t)
+                return nothing
+            end
+        end
+    end
+
+    @testset "allow_tma=false only on load" begin
+        @test @filecheck begin
+            @check "load_view_tko"
+            @check "optimization_hints = "
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a
+                pid = ct.bid(1)
+                t = ct.load(a, pid, (16,); allow_tma=false)
+                ct.store(a, pid, t)
+                return nothing
+            end
+        end
+    end
+
+    @testset "both hints on load" begin
+        @test @filecheck begin
+            @check "load_view_tko"
+            @check "optimization_hints = "
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a
+                pid = ct.bid(1)
+                t = ct.load(a, pid, (16,); latency=7, allow_tma=false)
+                ct.store(a, pid, t)
+                return nothing
+            end
+        end
+    end
+
+    @testset "latency only on store" begin
+        @test @filecheck begin
+            @check "store_view_tko"
+            @check "optimization_hints = "
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a
+                pid = ct.bid(1)
+                t = ct.load(a, pid, (16,))
+                ct.store(a, pid, t; latency=3)
+                return nothing
+            end
+        end
+    end
+
+    @testset "allow_tma=false only on store" begin
+        @test @filecheck begin
+            @check "store_view_tko"
+            @check "optimization_hints = "
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a
+                pid = ct.bid(1)
+                t = ct.load(a, pid, (16,))
+                ct.store(a, pid, t; allow_tma=false)
+                return nothing
+            end
+        end
+    end
+
+    @testset "both hints on store" begin
+        @test @filecheck begin
+            @check "store_view_tko"
+            @check "optimization_hints = "
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a
+                pid = ct.bid(1)
+                t = ct.load(a, pid, (16,))
+                ct.store(a, pid, t; allow_tma=false, latency=2)
+                return nothing
+            end
+        end
+    end
+
+    @testset "latency validation" begin
+        @test_throws "latency must be between 1 and 10" begin
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a
+                pid = ct.bid(1)
+                ct.load(a, pid, (16,); latency=11)
+            end
+        end
+
+        bytecode1 = ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a
+            pid = ct.bid(1)
+            t = ct.load(a, pid, (16,); latency=8)
+            ct.store(a, pid, t)
+            return nothing
+        end
+        @test !isempty(bytecode1)
+    end
+
+    @testset "multiple operations with mixed hints" begin
+        @test @filecheck begin
+            # First load is issued with latency=5
+            @check "load_view_tko"
+            @check "optimization_hints = "
+            # Second load is issued with allow_tma=false
+            @check "load_view_tko"
+            @check "optimization_hints = "
+            # Third load is issued without hints, so none may be printed after it
+            @check "load_view_tko"
+            @check_not "optimization_hints"
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d},
+                                ct.TileArray{Float32, 1, spec1d},
+                                ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a, b, c
+                pid = ct.bid(1)
+                t1 = ct.load(a, pid, (16,); latency=5)
+                t2 = ct.load(b, pid, (16,); allow_tma=false)
+                t3 = ct.load(c, pid, (16,))
+                result = t1 + t2 + t3
+                ct.store(a, pid, result)
+                return nothing
+            end
+        end
+    end
+
+    # Pointer-based operations (gather/scatter) take only the latency hint
+    @testset "gather with latency hint" begin
+        @test @filecheck begin
+            @check "load_ptr_tko"
+            @check "optimization_hints = "
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}, ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a, b
+                pid = ct.bid(1)
+                indices = ct.arange((16,), Int32)
+                tile = ct.gather(a, indices; latency=3)
+                ct.store(b, pid, tile)
+                return nothing
+            end
+        end
+    end
+
+    @testset "scatter with latency hint" begin
+        @test @filecheck begin
+            @check "store_ptr_tko"
+            @check "optimization_hints = "
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}, ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a, b
+                pid = ct.bid(1)
+                tile = ct.load(a, pid, (16,))
+                indices = ct.arange((16,), Int32)
+                ct.scatter(b, indices, tile; latency=5)
+                return nothing
+            end
+        end
+    end
+end
diff --git a/test/execution.jl b/test/execution.jl
index 44e5114..8297a9d 100644
--- a/test/execution.jl
+++ b/test/execution.jl
@@ -1590,7 +1590,7 @@
end

 end

-@testset "Entry Hints Integration" begin
+@testset "Entry Hints" begin

 @testset "launch with num_ctas" begin
     function vadd_kernel_num_ctas(a::ct.TileArray{Float32,1},
@@ -1656,3 +1656,346 @@ end
 end

 end
+
+@testset "Load / Store Optimization Hints" begin
+
+@testset "load with latency hint" begin
+    function vadd_with_load_latency(a::ct.TileArray{Float32,1},
+                                    b::ct.TileArray{Float32,1},
+                                    c::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,); latency=5)
+        tb = ct.load(b, pid, (16,); latency=3)
+        ct.store(c, pid, ta + tb)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.ones(Float32, n)
+    b = CUDA.ones(Float32, n) .* 2
+    c = CUDA.zeros(Float32, n)
+
+    ct.launch(vadd_with_load_latency, 64, a, b, c)
+    CUDA.synchronize()
+    @test Array(c) ≈ ones(Float32, n) .* 3
+end
+
+@testset "load with allow_tma=false" begin
+    function vadd_no_tma(a::ct.TileArray{Float32,1},
+                         b::ct.TileArray{Float32,1},
+                         c::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,); allow_tma=false)
+        tb = ct.load(b, pid, (16,); allow_tma=false)
+        ct.store(c, pid, ta + tb)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.ones(Float32, n)
+    b = CUDA.ones(Float32, n) .* 2
+    c = CUDA.zeros(Float32, n)
+
+    ct.launch(vadd_no_tma, 64, a, b, c)
+    CUDA.synchronize()
+    @test Array(c) ≈ ones(Float32, n) .* 3
+end
+
+@testset "load with both hints" begin
+    function vadd_both_load_hints(a::ct.TileArray{Float32,1},
+                                  b::ct.TileArray{Float32,1},
+                                  c::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,); latency=7, allow_tma=false)
+        tb = ct.load(b, pid, (16,); latency=4, allow_tma=true)
+        ct.store(c, pid, ta + tb)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.ones(Float32, n)
+    b = CUDA.ones(Float32, n) .* 2
+    c = CUDA.zeros(Float32, n)
+
+    ct.launch(vadd_both_load_hints, 64, a, b, c)
+    CUDA.synchronize()
+    @test Array(c) ≈ ones(Float32, n) .* 3
+end
+
+@testset "store with latency hint" begin
+    function copy_with_store_latency(a::ct.TileArray{Float32,1},
+                                     b::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,))
+        ct.store(b, pid, ta; latency=2)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.rand(Float32, n)
+    b = CUDA.zeros(Float32, n)
+
+    ct.launch(copy_with_store_latency, 64, a, b)
+    CUDA.synchronize()
+    @test Array(b) ≈ Array(a)
+end
+
+@testset "store with allow_tma=false" begin
+    function copy_no_tma_store(a::ct.TileArray{Float32,1},
+                               b::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,))
+        ct.store(b, pid, ta; allow_tma=false)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.rand(Float32, n)
+    b = CUDA.zeros(Float32, n)
+
+    ct.launch(copy_no_tma_store, 64, a, b)
+    CUDA.synchronize()
+    @test Array(b) ≈ Array(a)
+end
+
+@testset "different hints on load and store" begin
+    function vadd_mixed_hints(a::ct.TileArray{Float32,1},
+                              b::ct.TileArray{Float32,1},
+                              c::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        # Loads: latency hints 8 and 6, both with allow_tma=false
+        ta = ct.load(a, pid, (16,); latency=8, allow_tma=false)
+        tb = ct.load(b, pid, (16,); latency=6, allow_tma=false)
+        # Store: latency hint 2 with allow_tma=true
+        ct.store(c, pid, ta + tb; latency=2, allow_tma=true)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.ones(Float32, n)
+    b = CUDA.ones(Float32, n) .* 2
+    c = CUDA.zeros(Float32, n)
+
+    ct.launch(vadd_mixed_hints, 64, a, b, c)
+    CUDA.synchronize()
+    @test Array(c) ≈ ones(Float32, n) .* 3
+end
+
+@testset "2D matmul with hints" begin
+    function matmul_with_hints(a::ct.TileArray{Float32,2},
+                               b::ct.TileArray{Float32,2},
+                               c::ct.TileArray{Float32,2})
+        bidx = ct.bid(1)
+        bidy = ct.bid(2)
+        # Load a (32, 16) tile of `a` and a (16, 32) tile of `b`, both with latency=5
+        tile_a = ct.load(a, (bidx, 1), (32, 16); latency=5)
+        tile_b = ct.load(b, (1, bidy), (16, 32); latency=5)
+        result = tile_a * tile_b
+        # Store the product tile with latency=3
+        ct.store(c, (bidx, bidy), result; latency=3)
+        return nothing
+    end
+
+    M, K, N = 64, 16, 64
+    a = CUDA.rand(Float32, M, K)
+    b = CUDA.rand(Float32, K, N)
+    c = CUDA.zeros(Float32, M, N)
+
+    grid_x = cld(M, 32)
+    grid_y = cld(N, 32)
+    ct.launch(matmul_with_hints, (grid_x, grid_y, 1), a, b, c)
+    CUDA.synchronize()
+
+    # Compare the device result with the host-side product a_cpu * b_cpu
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    c_cpu = Array(c)
+    c_ref = a_cpu * b_cpu
+
+    @test c_cpu ≈ c_ref rtol=1e-5
+end
+
+@testset "reduction with hints" begin
+    function reduce_with_hints(a::ct.TileArray{Float32,2},
+                               b::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        # Load one (1, 128) tile with latency=6 and allow_tma=false
+        tile = ct.load(a, (pid, 1), (1, 128); latency=6, allow_tma=false)
+        sums = ct.reduce_sum(tile, 2)
+        # Store the reduced tile with latency=2
+        ct.store(b, pid, sums; latency=2)
+        return nothing
+    end
+
+    m, n = 64, 128
+    a = CUDA.rand(Float32, m, n)
+    b = CUDA.zeros(Float32, m)
+
+    ct.launch(reduce_with_hints, m, a, b)
+    CUDA.synchronize()
+
+    # b[i] must match the host-side sum of row i of a
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] ≈ sum(a_cpu[i, :]) rtol=1e-3
+    end
+end
+
+@testset "transpose with hints" begin
+    function transpose_with_hints(x::ct.TileArray{Float32,2},
+                                  y::ct.TileArray{Float32,2})
+        bidx = ct.bid(1)
+        bidy = ct.bid(2)
+        # Load a (32, 32) tile with latency=9
+        tile = ct.load(x, (bidx, bidy), (32, 32); latency=9)
+        transposed = ct.transpose(tile)
+        # Store at the swapped block index (bidy, bidx) with latency=4
+        ct.store(y, (bidy, bidx), transposed; latency=4)
+        return nothing
+    end
+
+    m, n = 256, 128
+    tile_size = 32
+    x = CUDA.rand(Float32, m, n)
+    y = CUDA.zeros(Float32, n, m)
+
+    ct.launch(transpose_with_hints, (cld(m, tile_size), cld(n, tile_size)), x, y)
+    CUDA.synchronize()
+
+    @test Array(y) ≈ transpose(Array(x))
+end
+
+@testset "complex kernel with multiple loads/stores with hints" begin
+    function complex_hints_kernel(a::ct.TileArray{Float32,1},
+                                  b::ct.TileArray{Float32,1},
+                                  c::ct.TileArray{Float32,1},
+                                  d::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        # Three loads, each with a different latency / allow_tma combination
+        ta = ct.load(a, pid, (16,); latency=10, allow_tma=false)
+        tb = ct.load(b, pid, (16,); latency=5, allow_tma=true)
+        tc = ct.load(c, pid, (16,); latency=7)
+
+        # Element-wise sum of the three tiles
+        result = ta + tb + tc
+
+        # Store with latency=1 and allow_tma=false
+        ct.store(d, pid, result; latency=1, allow_tma=false)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.ones(Float32, n)
+    b = CUDA.ones(Float32, n) .* 2
+    c = CUDA.ones(Float32, n) .* 3
+    d = CUDA.zeros(Float32, n)
+
+    ct.launch(complex_hints_kernel, 64, a, b, c, d)
+    CUDA.synchronize()
+    @test Array(d) ≈ ones(Float32, n) .* 6
+end
+
+@testset "hints with Float64" begin
+    function vadd_f64_hints(a::ct.TileArray{Float64,1},
+                            b::ct.TileArray{Float64,1},
+                            c::ct.TileArray{Float64,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,); latency=8)
+        tb = ct.load(b, pid, (16,); latency=8)
+        ct.store(c, pid, ta + tb; latency=4)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.rand(Float64, n)
+    b = CUDA.rand(Float64, n)
+    c = CUDA.zeros(Float64, n)
+
+    ct.launch(vadd_f64_hints, 64, a, b, c)
+    CUDA.synchronize()
+    @test Array(c) ≈ Array(a) + Array(b)
+end
+
+@testset "hints with Float16" begin
+    function vadd_f16_hints(a::ct.TileArray{Float16,1},
+                            b::ct.TileArray{Float16,1},
+                            c::ct.TileArray{Float16,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,); latency=3, allow_tma=false)
+        tb = ct.load(b, pid, (16,); latency=3, allow_tma=false)
+        ct.store(c, pid, ta + tb; latency=1)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.rand(Float16, n)
+    b = CUDA.rand(Float16, n)
+    c = CUDA.zeros(Float16, n)
+
+    ct.launch(vadd_f16_hints, 64, a, b, c)
+    CUDA.synchronize()
+    @test Array(c) ≈ Array(a) + Array(b)
+end
+
+@testset "boundary latency values" begin
+    function test_boundary_latency(a::ct.TileArray{Float32,1},
+                                   b::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        # Both ends of the documented 1-10 latency range: 1 on the load, 10 on the store
+        ta = ct.load(a, pid, (16,); latency=1)
+        ct.store(b, pid, ta; latency=10)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.rand(Float32, n)
+    b = CUDA.zeros(Float32, n)
+
+    ct.launch(test_boundary_latency, 64, a, b)
+    CUDA.synchronize()
+    @test Array(b) ≈ Array(a)
+end
+
+# Pointer-based operations (gather/scatter) exercised with the latency hint
+@testset "gather with latency hint" begin
+    function gather_with_latency(a::ct.TileArray{Float32,1},
+                                 b::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        base = (pid - 1) * 16
+        indices = base .+ ct.arange((16,), Int32)
+        tile = ct.gather(a, indices; latency=5)
+        ct.store(b, pid, tile)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.rand(Float32, n)
+    b = CUDA.zeros(Float32, n)
+
+    ct.launch(gather_with_latency, 64, a, b)
+    CUDA.synchronize()
+    @test Array(b) ≈ Array(a)
+end
+
+@testset "scatter with latency hint" begin
+    function scatter_with_latency(a::ct.TileArray{Float32,1},
+                                  b::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, pid, (16,))
+        base = (pid - 1) * 16
+        indices = base .+ ct.arange((16,), Int32)
+        ct.scatter(b, indices, tile; latency=3)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.rand(Float32, n)
+    b = CUDA.zeros(Float32, n)
+
+    ct.launch(scatter_with_latency, 64, a, b)
+    CUDA.synchronize()
+    @test Array(b) ≈ Array(a)
+end
+
+end
From 24a25bb97daa69250a986f13e38cb04f08e7b098 Mon Sep 17 00:00:00 2001
From: AntonOresten
Date: Wed, 14 Jan 2026 09:28:17 +0100
Subject: [PATCH 2/2] docstring and comment fixes

---
 src/bytecode/writer.jl           | 2 +-
 src/compiler/intrinsics/views.jl | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl
index 67c3df1..eb87585 100644
--- a/src/bytecode/writer.jl
+++ b/src/bytecode/writer.jl
@@ -544,7 +544,7 @@ function finalize_function!(func_buf::Vector{UInt8}, cb::CodeBuilder,
 end

 #=============================================================================
- Optimization and Entry Hints
+ Optimization Hints
 =============================================================================#

 """
diff --git a/src/compiler/intrinsics/views.jl b/src/compiler/intrinsics/views.jl
index 0690bc9..47986a9 100644
--- a/src/compiler/intrinsics/views.jl
+++ b/src/compiler/intrinsics/views.jl
@@ -1,5 +1,8 @@
 # views

+"""
+Convert
integer padding mode value to bytecode PaddingValue enum. +""" function padding_mode_to_padding_value(mode::Int) mode == 0 ? PaddingMissing : mode == 1 ? PaddingZero :