From 5a603b2a7f9d97b5c0abda98774eef71c571ce80 Mon Sep 17 00:00:00 2001
From: AntonOresten <antonoresten@proton.me>
Date: Wed, 14 Jan 2026 01:11:14 +0100
Subject: [PATCH 1/2] expose load/store/gather/scatter optimization hints

---
 src/bytecode/encodings.jl         |  16 +-
 src/bytecode/writer.jl            |  76 ++++++-
 src/compiler/codegen/kernel.jl    |   2 +-
 src/compiler/intrinsics.jl        |   8 +
 src/compiler/intrinsics/memory.jl |  62 ++++--
 src/compiler/intrinsics/views.jl  |  69 ++++--
 src/compiler/target.jl            |   6 +-
 src/language/operations.jl        | 112 +++++++---
 test/codegen.jl                   | 160 +++++++++++++-
 test/execution.jl                 | 345 +++++++++++++++++++++++++++++-
 10 files changed, 765 insertions(+), 91 deletions(-)

diff --git a/src/bytecode/encodings.jl b/src/bytecode/encodings.jl
index 9305bee..9f06415 100644
--- a/src/bytecode/encodings.jl
+++ b/src/bytecode/encodings.jl
@@ -423,7 +423,7 @@ function encode_LoadViewTkoOp!(cb::CodeBuilder,
                                token::Union{Value, Nothing}=nothing,
                                memory_ordering::MemoryOrderingSemantics=MemoryWeak,
                                memory_scope::Union{MemoryScope, Nothing}=nothing,
-                               optimization_hints::Union{Vector{UInt8}, Nothing}=nothing)
+                               optimization_hints::Union{OptimizationHints, Nothing}=nothing)
     encode_varint!(cb.buf, Opcode.LoadViewTkoOp)
     # Variadic result types
     encode_typeid_seq!(cb.buf, [tile_type, token_type])
@@ -447,7 +447,7 @@ function encode_LoadViewTkoOp!(cb::CodeBuilder,
         encode_enum!(cb.buf, memory_scope)
     end
     if optimization_hints !== nothing
-        append!(cb.buf, optimization_hints)
+        encode_opattr_optimization_hints!(cb, optimization_hints)
     end
 
     # Operands
@@ -472,7 +472,7 @@ function encode_StoreViewTkoOp!(cb::CodeBuilder,
                                 token::Union{Value, Nothing}=nothing,
                                 memory_ordering::MemoryOrderingSemantics=MemoryWeak,
                                 memory_scope::Union{MemoryScope, Nothing}=nothing,
-                                optimization_hints::Union{Vector{UInt8}, Nothing}=nothing)
+                                optimization_hints::Union{OptimizationHints, Nothing}=nothing)
     encode_varint!(cb.buf, Opcode.StoreViewTkoOp)
     # Variadic result types (just token)
     encode_typeid_seq!(cb.buf, [token_type])
@@ -496,7 +496,7 @@ function encode_StoreViewTkoOp!(cb::CodeBuilder,
         encode_enum!(cb.buf, memory_scope)
     end
     if optimization_hints !== nothing
-        append!(cb.buf, optimization_hints)
+        encode_opattr_optimization_hints!(cb, optimization_hints)
     end
 
     # Operands
@@ -541,7 +541,7 @@ function encode_LoadPtrTkoOp!(cb::CodeBuilder,
                               token::Union{Value, Nothing}=nothing,
                               memory_ordering::MemoryOrderingSemantics=MemoryWeak,
                               memory_scope::Union{MemoryScope, Nothing}=nothing,
-                              optimization_hints::Union{Vector{UInt8}, Nothing}=nothing)
+                              optimization_hints::Union{OptimizationHints, Nothing}=nothing)
     encode_varint!(cb.buf, Opcode.LoadPtrTkoOp)
     # Result types
     encode_typeid!(cb.buf, result_type)
@@ -572,7 +572,7 @@ function encode_LoadPtrTkoOp!(cb::CodeBuilder,
         encode_enum!(cb.buf, memory_scope)
     end
     if optimization_hints !== nothing
-        append!(cb.buf, optimization_hints)
+        encode_opattr_optimization_hints!(cb, optimization_hints)
     end
 
     # Operands
@@ -600,7 +600,7 @@ function encode_StorePtrTkoOp!(cb::CodeBuilder,
                                token::Union{Value, Nothing}=nothing,
                                memory_ordering::MemoryOrderingSemantics=MemoryWeak,
                                memory_scope::Union{MemoryScope, Nothing}=nothing,
-                               optimization_hints::Union{Vector{UInt8}, Nothing}=nothing)
+                               optimization_hints::Union{OptimizationHints, Nothing}=nothing)
     encode_varint!(cb.buf, Opcode.StorePtrTkoOp)
     # Result type (token)
     encode_typeid!(cb.buf, token_type)
@@ -627,7 +627,7 @@ function encode_StorePtrTkoOp!(cb::CodeBuilder,
         encode_enum!(cb.buf, memory_scope)
     end
     if optimization_hints !== nothing
-        append!(cb.buf, optimization_hints)
+        encode_opattr_optimization_hints!(cb, optimization_hints)
     end
 
     # Operands
diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl
index bd2bc5d..67c3df1 100644
--- a/src/bytecode/writer.jl
+++ b/src/bytecode/writer.jl
@@ -544,9 +544,75 @@ function finalize_function!(func_buf::Vector{UInt8}, cb::CodeBuilder,
 end
 
 #=============================================================================
- EntryHints: Kernel-level compilation hints
+ Optimization and Entry Hints 
 =============================================================================#
 
+"""
+    encode_tagged_value!(buf, type_table, value)
+
+Append `value` to `buf` preceded by its attribute tag (`Bool`, or `Integer` encoded as I32).
+"""
+function encode_tagged_value!(buf::Vector{UInt8}, type_table::TypeTable, value::Bool)
+    push!(buf, AttributeTag.Bool)
+    push!(buf, value)  # Bool is converted to a single 0x00/0x01 byte
+end
+
+function encode_tagged_value!(buf::Vector{UInt8}, type_table::TypeTable, value::Integer)
+    push!(buf, AttributeTag.Integer)
+    encode_typeid!(buf, I32(type_table))
+    encode_varint!(buf, UInt32(value))  # UInt32(...) throws for negative or oversized values
+end
+
+"""
+Optimization hints for a single load/store operation (keyword-constructible via `@kwdef`).
+- `latency`: Optional latency hint (1-10; the range is not checked by this struct), or `nothing` for default
+- `allow_tma`: Whether TMA (Tensor Memory Accelerator) is allowed (default: true)
+"""
+@kwdef struct LoadStoreHints
+    latency::Union{Int, Nothing} = nothing
+    allow_tma::Bool = true
+end
+
+"""
+Per-architecture collection of load/store optimization hints.
+- `hints_by_arch`: List of (SM architecture, load/store hints) pairs
+"""
+struct OptimizationHints
+    hints_by_arch::Vector{Tuple{String, LoadStoreHints}}
+end
+
+function make_load_store_hints(sm_arch::Union{String, Nothing}, hints::LoadStoreHints)
+    isnothing(sm_arch) && throw(ArgumentError("sm_arch must be explicitly passed when load/store hints are present"))
+    OptimizationHints([(sm_arch, hints)])  # single-entry table for the given architecture
+end
+
+function encode_opattr_optimization_hints!(cb::CodeBuilder, hints::OptimizationHints)
+    # Outer dictionary (entry count, no tag byte): arch string id -> hints_dict
+    encode_varint!(cb.buf, length(hints.hints_by_arch))
+    for (arch, load_store_hints) in hints.hints_by_arch
+        arch_id = cb.string_table[arch]  # NOTE(review): presumably interns `arch` - confirm
+        encode_varint!(cb.buf, arch_id.id)
+        # Inner dictionary carries its own AttributeTag.Dictionary tag byte
+        encode_load_store_hints_dict!(cb, load_store_hints)
+    end
+end
+
+function encode_load_store_hints_dict!(cb::CodeBuilder, hints::LoadStoreHints)
+    # Collect only non-default hints: allow_tma when false, then latency when set
+    items = Tuple{String, Any}[]
+    hints.allow_tma || push!(items, ("allow_tma", false))
+    isnothing(hints.latency) || push!(items, ("latency", hints.latency))
+
+    # Encode dictionary: tag byte, entry count, then (key string id, tagged value) per entry
+    push!(cb.buf, AttributeTag.Dictionary)
+    encode_varint!(cb.buf, length(items))
+    for (key, value) in items
+        key_id = cb.string_table[key]
+        encode_varint!(cb.buf, key_id.id)
+        encode_tagged_value!(cb.buf, cb.type_table, value)
+    end
+end
+
 """
 Kernel-level compilation hints (num_ctas, occupancy).
 Encoded as a dictionary attribute in bytecode.
@@ -567,10 +633,6 @@ function validate_occupancy(occupancy::Union{Int, Nothing})
     1 <= occupancy <= 32 || throw(ArgumentError("occupancy must be between 1 and 32, got $occupancy"))
 end
 
-"""
-Encode EntryHints as OptimizationHints format.
-Returns raw bytes for entry_hints parameter or nothing.
-"""
 function encode_entry_hints(writer::BytecodeWriter, sm_arch::Union{String, Nothing}, hints::EntryHints)
     validate_num_ctas(hints.num_ctas)
     validate_occupancy(hints.occupancy)
@@ -603,9 +665,7 @@ function encode_entry_hints(writer::BytecodeWriter, sm_arch::Union{String, Nothi
     for (key, value) in items
         key_id = writer.string_table[key]
         encode_varint!(buf, key_id.id)
-        push!(buf, AttributeTag.Integer)
-        encode_typeid!(buf, I32(writer.type_table))
-        encode_varint!(buf, UInt32(value))
+        encode_tagged_value!(buf, writer.type_table, value)
     end
 
     return buf
diff --git a/src/compiler/codegen/kernel.jl b/src/compiler/codegen/kernel.jl
index 71a4f5c..81492b1 100644
--- a/src/compiler/codegen/kernel.jl
+++ b/src/compiler/codegen/kernel.jl
@@ -12,7 +12,7 @@ function emit_kernel!(writer::BytecodeWriter, func_buf::Vector{UInt8},
                       is_entry::Bool = true,
                       num_ctas::Union{Int, Nothing} = nothing,
                       occupancy::Union{Int, Nothing} = nothing)
-    ctx = CGCtx(writer, target)
+    ctx = CGCtx(writer, target, sm_arch)
     tt = ctx.tt
 
     # Validate non-ghost argument types are concrete
diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl
index 06a8f40..16c55da 100644
--- a/src/compiler/intrinsics.jl
+++ b/src/compiler/intrinsics.jl
@@ -21,6 +21,14 @@ end
 
 emit_intrinsic!(ctx::CGCtx, @nospecialize(func), args) = missing
 
+# Shared helper for creating load/store optimization hints; returns `nothing` when every hint is at its default
+function create_optimization_hints(ctx::CGCtx, latency::Union{Int, Nothing}, allow_tma::Bool=true)
+    isnothing(latency) && allow_tma && return nothing  # all defaults: nothing to encode
+    isnothing(latency) || 1 <= latency <= 10 || error("latency must be between 1 and 10, got $latency")
+    hints = LoadStoreHints(; latency, allow_tma)
+    return make_load_store_hints(ctx.sm_arch, hints)
+end
+
 include("intrinsics/core.jl")
 include("intrinsics/conversions.jl")
 include("intrinsics/arithmetic.jl")
diff --git a/src/compiler/intrinsics/memory.jl b/src/compiler/intrinsics/memory.jl
index 4e7806d..1d7648a 100644
--- a/src/compiler/intrinsics/memory.jl
+++ b/src/compiler/intrinsics/memory.jl
@@ -5,16 +5,20 @@
 # cuda_tile.load_ptr_tko
 @eval Intrinsics begin
     """
-        load_ptr_tko(ptrs, mask=nothing, padding=nothing)
+        load_ptr_tko(ptrs, latency=nothing, mask=nothing, padding=nothing)
 
     Load values from a tile of pointers.
     If mask is provided, masked-out positions return the padding value.
     Compiled to cuda_tile.load_ptr_tko.
+
+    Note: TMA (allow_tma) is not applicable for pointer-based loads as they
+    support irregular access patterns incompatible with TMA requirements.
     """
     @noinline function load_ptr_tko(ptrs::Tile{Ptr{T}, S},
+                                     latency::Union{Int, Nothing}=nothing,
                                      mask::Union{Tile{Bool, S}, Nothing}=nothing,
                                      padding::Union{Tile{T, S}, Nothing}=nothing) where {T, S}
-        donotdelete(ptrs, mask, padding)
+        donotdelete(ptrs, latency, mask, padding)
         Tile{T, S}()
     end
 end
@@ -22,6 +26,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.load_ptr_tko), args)
     cb = ctx.cb
     tt = ctx.tt
 
+    # args: (ptrs, latency, mask?, padding?)
     # Get pointer tile (arg 1)
     ptrs_tv = emit_value!(ctx, args[1])
     ptrs_tv === nothing && error("load_ptr_tko: cannot resolve pointer tile")
@@ -36,17 +41,23 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.load_ptr_tko), args)
     result_tile_type = tile_type!(tt, dtype, tile_shape)
     token_type = Token(tt)
 
-    # Check if mask is provided (arg 2 is not nothing)
-    has_mask = length(args) >= 2 && get_constant(ctx, args[2]) !== nothing
+    # Extract latency hint (args[2])
+    latency = get_constant(ctx, args[2])
+
+    # Create optimization hints if provided
+    optimization_hints = create_optimization_hints(ctx, latency)
+
+    # Check if mask is provided (arg 3 is not nothing)
+    has_mask = length(args) >= 3 && get_constant(ctx, args[3]) !== nothing
 
     if has_mask
-        # Get mask tile (arg 2)
-        mask_tv = emit_value!(ctx, args[2])
+        # Get mask tile (arg 3)
+        mask_tv = emit_value!(ctx, args[3])
         mask_tv === nothing && error("load_ptr_tko: cannot resolve mask tile")
         mask = mask_tv.v
 
-        # Get padding tile (arg 3)
-        padding_tv = emit_value!(ctx, args[3])
+        # Get padding tile (arg 4)
+        padding_tv = emit_value!(ctx, args[4])
         padding_tv === nothing && error("load_ptr_tko: cannot resolve padding tile")
         padding = padding_tv.v
 
@@ -54,11 +65,13 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.load_ptr_tko), args)
         tile_val, new_token = encode_LoadPtrTkoOp!(cb, result_tile_type, token_type, pointers;
                                                     mask=mask,
                                                     padding_value=padding,
-                                                    token=ctx.token)
+                                                    token=ctx.token,
+                                                    optimization_hints)
     else
         # Load without mask
         tile_val, new_token = encode_LoadPtrTkoOp!(cb, result_tile_type, token_type, pointers;
-                                                    token=ctx.token)
+                                                    token=ctx.token,
+                                                    optimization_hints)
     end
     ctx.token = new_token
 
@@ -71,15 +84,19 @@ end
 # cuda_tile.store_ptr_tko
 @eval Intrinsics begin
     """
-        store_ptr_tko(ptrs, values, mask=nothing)
+        store_ptr_tko(ptrs, values, latency, mask=nothing)
 
     Store values to a tile of pointers.
     If mask is provided, masked-out positions are not written.
     Compiled to cuda_tile.store_ptr_tko.
+
+    Note: TMA (allow_tma) is not applicable for pointer-based stores as they
+    support irregular access patterns incompatible with TMA requirements.
     """
     @noinline function store_ptr_tko(ptrs::Tile{Ptr{T}, S}, values::Tile{T, S},
+                                      latency::Union{Int, Nothing},
                                       mask::Union{Tile{Bool, S}, Nothing}=nothing) where {T, S}
-        donotdelete(ptrs, values, mask)
+        donotdelete(ptrs, values, latency, mask)
         nothing
     end
 end
@@ -87,6 +104,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.store_ptr_tko), args)
     cb = ctx.cb
     tt = ctx.tt
 
+    # args: (ptrs, values, latency, mask?)
     # Get pointer tile (arg 1)
     ptrs_tv = emit_value!(ctx, args[1])
     ptrs_tv === nothing && error("store_ptr_tko: cannot resolve pointer tile")
@@ -99,23 +117,31 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.store_ptr_tko), args)
 
     token_type = Token(tt)
 
-    # Check if mask is provided (arg 3 is not nothing)
-    has_mask = length(args) >= 3 && get_constant(ctx, args[3]) !== nothing
+    # Extract latency hint (args[3])
+    latency = get_constant(ctx, args[3])
+
+    # Create optimization hints if provided
+    optimization_hints = create_optimization_hints(ctx, latency)
+
+    # Check if mask is provided (arg 4 is not nothing)
+    has_mask = length(args) >= 4 && get_constant(ctx, args[4]) !== nothing
 
     if has_mask
-        # Get mask tile (arg 3)
-        mask_tv = emit_value!(ctx, args[3])
+        # Get mask tile (arg 4)
+        mask_tv = emit_value!(ctx, args[4])
         mask_tv === nothing && error("store_ptr_tko: cannot resolve mask tile")
         mask = mask_tv.v
 
         # Store with mask
         new_token = encode_StorePtrTkoOp!(cb, token_type, pointers, values;
                                            mask=mask,
-                                           token=ctx.token)
+                                           token=ctx.token,
+                                           optimization_hints)
     else
         # Store without mask
         new_token = encode_StorePtrTkoOp!(cb, token_type, pointers, values;
-                                           token=ctx.token)
+                                           token=ctx.token,
+                                           optimization_hints)
     end
     ctx.token = new_token
 
diff --git a/src/compiler/intrinsics/views.jl b/src/compiler/intrinsics/views.jl
index 2773a30..0690bc9 100644
--- a/src/compiler/intrinsics/views.jl
+++ b/src/compiler/intrinsics/views.jl
@@ -1,8 +1,5 @@
 # views
 
-"""
-Convert integer padding mode value to bytecode PaddingValue enum.
-"""
 function padding_mode_to_padding_value(mode::Int)
     mode == 0 ? PaddingMissing :
     mode == 1 ? PaddingZero :
@@ -83,13 +80,16 @@ end
 # cuda_tile.load_view_tko
 @eval Intrinsics begin
     """
-        load_partition_view(pv::PartitionView, index...) -> Tile
+        load_partition_view(pv::PartitionView, latency, allow_tma, index...) -> Tile
 
     Load a tile from a partition view at the given 0-indexed tile coordinates.
     Compiled to cuda_tile.load_view_tko.
     """
-    @noinline function load_partition_view(pv::PartitionView{T, N, Shape}, index::Vararg{Integer}) where {T, N, Shape}
-        donotdelete(pv)
+    @noinline function load_partition_view(pv::PartitionView{T, N, Shape},
+                                            latency::Union{Int, Nothing},
+                                            allow_tma::Bool,
+                                            index::Vararg{Integer}) where {T, N, Shape}
+        donotdelete(pv, latency, allow_tma)
         Tile{T, Shape}()
     end
 end
@@ -97,7 +97,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.load_partition_view), a
     cb = ctx.cb
     tt = ctx.tt
 
-    # args: (partition_view, indices...)
+    # args: (partition_view, latency, allow_tma, indices...)
     pv_arg = emit_value!(ctx, args[1])
     pv_arg === nothing && error("load_partition_view() requires a PartitionView argument")
     pv_arg.v === nothing && error("load_partition_view() requires a materialized PartitionView")
@@ -115,10 +115,21 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.load_partition_view), a
     tile_type = tile_type!(tt, dtype, tile_shape)
     token_type = Token(tt)
 
-    # Extract indices from args[2:end] and infer index type
+    # Extract optimization hints (args[2] = latency, args[3] = allow_tma)
+    latency = get_constant(ctx, args[2])
+    allow_tma = get_constant(ctx, args[3])
+
+    # Verify we got compile-time constants
+    if latency === nothing && allow_tma === nothing
+        error("load_partition_view(): latency and allow_tma must be compile-time constants")
+    end
+    # allow_tma defaults to true if not provided
+    allow_tma_val = allow_tma === nothing ? true : allow_tma::Bool
+
+    # Extract indices from args[4:end] and infer index type
     index_vals = Value[]
     index_jl_types = Type[]
-    for i in 2:length(args)
+    for i in 4:length(args)
         tv = emit_value!(ctx, args[i])
         tv === nothing && error("load_partition_view(): cannot resolve index argument")
         push!(index_vals, tv.v)
@@ -133,8 +144,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.load_partition_view), a
     # Pad indices if needed
     index_vals = pad_indices(ctx, index_vals, ndim, index_type, index_jl_type)
 
+    # Create optimization hints if provided
+    optimization_hints = create_optimization_hints(ctx, latency, allow_tma_val)
+
     # Load tile with token
-    tile_val, new_token = encode_LoadViewTkoOp!(cb, tile_type, token_type, pv_arg.v, index_vals; token=ctx.token)
+    tile_val, new_token = encode_LoadViewTkoOp!(cb, tile_type, token_type, pv_arg.v, index_vals;
+                                                 token=ctx.token, optimization_hints)
     ctx.token = new_token
 
     CGVal(tile_val, tile_type, Tile{elem_type, Tuple(tile_shape)}, tile_shape)
@@ -327,14 +342,17 @@ end
 # cuda_tile.store_view_tko
 @eval Intrinsics begin
     """
-        store_partition_view(pv::PartitionView, tile, index...) -> Nothing
+        store_partition_view(pv::PartitionView, tile, latency, allow_tma, index...) -> Nothing
 
     Store a tile to a partition view at the given 0-indexed tile coordinates.
     Compiled to cuda_tile.store_view_tko.
     """
-    @noinline function store_partition_view(pv::PartitionView{T, N, Shape}, tile::Tile{T, Shape}, index::Vararg{Integer}) where {T, N, Shape}
-        donotdelete(pv)
-        donotdelete(tile)
+    @noinline function store_partition_view(pv::PartitionView{T, N, Shape},
+                                             tile::Tile{T, Shape},
+                                             latency::Union{Int, Nothing},
+                                             allow_tma::Bool,
+                                             index::Vararg{Integer}) where {T, N, Shape}
+        donotdelete(pv, tile, latency, allow_tma)
         nothing
     end
 end
@@ -342,7 +360,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.store_partition_view),
     cb = ctx.cb
     tt = ctx.tt
 
-    # args: (partition_view, tile, indices...)
+    # args: (partition_view, tile, latency, allow_tma, indices...)
     pv_arg = emit_value!(ctx, args[1])
     pv_arg === nothing && error("store_partition_view() requires a PartitionView argument")
     pv_arg.v === nothing && error("store_partition_view() requires a materialized PartitionView")
@@ -371,10 +389,21 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.store_partition_view),
         tile_val = encode_ReshapeOp!(cb, tile_1d_type, tile_val)
     end
 
-    # Extract indices from args[3:end] and infer index type
+    # Extract optimization hints (args[3] = latency, args[4] = allow_tma)
+    latency = get_constant(ctx, args[3])
+    allow_tma = get_constant(ctx, args[4])
+
+    # Verify we got compile-time constants
+    if latency === nothing && allow_tma === nothing
+        error("store_partition_view(): latency and allow_tma must be compile-time constants")
+    end
+    # allow_tma defaults to true if not provided
+    allow_tma_val = allow_tma === nothing ? true : allow_tma::Bool
+
+    # Extract indices from args[5:end] and infer index type
     index_vals = Value[]
     index_jl_types = Type[]
-    for i in 3:length(args)
+    for i in 5:length(args)
         tv = emit_value!(ctx, args[i])
         tv === nothing && error("store_partition_view(): cannot resolve index argument")
         push!(index_vals, tv.v)
@@ -389,9 +418,13 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.store_partition_view),
     # Pad indices if needed
     index_vals = pad_indices(ctx, index_vals, actual_ndim, index_type, index_jl_type)
 
+    # Create optimization hints if provided
+    optimization_hints = create_optimization_hints(ctx, latency, allow_tma_val)
+
     # Store tile with token
     token_type = Token(tt)
-    new_token = encode_StoreViewTkoOp!(cb, token_type, tile_val, pv_arg.v, index_vals; token=ctx.token)
+    new_token = encode_StoreViewTkoOp!(cb, token_type, tile_val, pv_arg.v, index_vals;
+                                        token=ctx.token, optimization_hints)
     ctx.token = new_token
 
     nothing
diff --git a/src/compiler/target.jl b/src/compiler/target.jl
index cc569b7..af26ce7 100644
--- a/src/compiler/target.jl
+++ b/src/compiler/target.jl
@@ -150,9 +150,12 @@ mutable struct CGCtx
 
     # Type cache: Julia type -> TypeId
     type_cache::Dict{Type, TypeId}
+
+    # Target architecture string (e.g., "sm_100"), or `nothing` if unspecified
+    sm_arch::Union{String, Nothing}
 end
 
-function CGCtx(writer::BytecodeWriter, target::TileTarget)
+function CGCtx(writer::BytecodeWriter, target::TileTarget, sm_arch::Union{String, Nothing}=nothing)
     CGCtx(
         Dict{Int, CGVal}(),
         Dict{Int, CGVal}(),
@@ -167,6 +170,7 @@ function CGCtx(writer::BytecodeWriter, target::TileTarget)
         nothing,
         nothing,
         Dict{Type, TypeId}(),
+        sm_arch,
     )
 end
 
diff --git a/src/language/operations.jl b/src/language/operations.jl
index bf20bb2..ace4b7a 100644
--- a/src/language/operations.jl
+++ b/src/language/operations.jl
@@ -60,7 +60,7 @@ Axis is 1-indexed. Equivalent to cld(arr.sizes[axis], shape[axis]).
 end
 
 """
-    load(arr::TileArray, index, shape; padding_mode=PaddingMode.Undetermined) -> Tile
+    load(arr::TileArray, index, shape; padding_mode=PaddingMode.Undetermined, latency=nothing, allow_tma=true) -> Tile
 
 Load a tile from a TileArray at the given index with the specified shape.
 Index is 1-indexed. Shape must be compile-time constant.
@@ -73,101 +73,131 @@ Index is 1-indexed. Shape must be compile-time constant.
 - `PaddingMode.PosInf`: Return positive infinity for OOB elements
 - `PaddingMode.NegInf`: Return negative infinity for OOB elements
 
+# Optimization Hints
+- `latency`: Optional latency hint (1-10), or nothing for compiler default
+- `allow_tma`: Whether TMA (Tensor Memory Accelerator) is allowed (default: true)
+
 # Example
 ```julia
-tile = ct.load(arr, (bid,), (TILE_N[],); padding_mode=ct.PaddingMode.Zero)
+tile = ct.load(arr, (bid,), (TILE_N[],); padding_mode=ct.PaddingMode.Zero, latency=3)
 ```
 """
 @inline function load(arr::TileArray{T, N}, index, shape::NTuple{<:Any, Int};
-                      padding_mode::Int=PaddingMode.Undetermined) where {T, N}
+                      padding_mode::Int=PaddingMode.Undetermined,
+                      latency::Union{Int, Nothing}=nothing,
+                      allow_tma::Bool=true) where {T, N}
     tv = Intrinsics.make_tensor_view(arr)
     pv = Intrinsics.make_partition_view(tv, Val(shape), padding_mode)
-    Intrinsics.load_partition_view(pv, (promote(index...) .- One())...)
+    Intrinsics.load_partition_view(pv, latency, allow_tma, (promote(index...) .- One())...)
 end
 
 @inline function load(arr::TileArray{T, N}, index::Integer, shape::NTuple{<:Any, Int};
-                      padding_mode::Int=PaddingMode.Undetermined) where {T, N}
+                      padding_mode::Int=PaddingMode.Undetermined,
+                      latency::Union{Int, Nothing}=nothing,
+                      allow_tma::Bool=true) where {T, N}
     tv = Intrinsics.make_tensor_view(arr)
     pv = Intrinsics.make_partition_view(tv, Val(shape), padding_mode)
-    Intrinsics.load_partition_view(pv, index - One())
+    Intrinsics.load_partition_view(pv, latency, allow_tma, index - One())
 end
 
 # Load with Constant shape tuple
 @inline function load(arr::TileArray{T, N}, index, shape::Tuple{Vararg{Constant{Int}}};
-                      padding_mode::Int=PaddingMode.Undetermined) where {T, N}
+                      padding_mode::Int=PaddingMode.Undetermined,
+                      latency::Union{Int, Nothing}=nothing,
+                      allow_tma::Bool=true) where {T, N}
     shape_val = _extract_shape(shape)
     tv = Intrinsics.make_tensor_view(arr)
     pv = Intrinsics.make_partition_view(tv, Val(shape_val), padding_mode)
-    Intrinsics.load_partition_view(pv, (promote(index...) .- One())...)
+    Intrinsics.load_partition_view(pv, latency, allow_tma, (promote(index...) .- One())...)
 end
 
 # Keyword argument version
 @inline function load(arr::TileArray{T, N}; index, shape,
-                      padding_mode::Int=PaddingMode.Undetermined) where {T, N}
+                      padding_mode::Int=PaddingMode.Undetermined,
+                      latency::Union{Int, Nothing}=nothing,
+                      allow_tma::Bool=true) where {T, N}
     shape_val = _extract_shape(shape)
     tv = Intrinsics.make_tensor_view(arr)
     pv = Intrinsics.make_partition_view(tv, Val(shape_val), padding_mode)
-    Intrinsics.load_partition_view(pv, (promote(index...) .- One())...)
+    Intrinsics.load_partition_view(pv, latency, allow_tma, (promote(index...) .- One())...)
 end
 
 """
-    store(arr::TileArray, index, tile::Tile) -> Tile
+    store(arr::TileArray, index, tile::Tile; latency=nothing, allow_tma=true) -> Tile
 
 Store a tile to a TileArray at the given index. Index is 1-indexed.
 Returns the stored tile (enables chaining and helps constant folding).
+
+# Optimization Hints
+- `latency`: Optional latency hint (1-10), or nothing for compiler default
+- `allow_tma`: Whether TMA (Tensor Memory Accelerator) is allowed (default: true)
 """
 # Regular N-D tiles (N >= 1)
-@inline function store(arr::TileArray{T}, index, tile::Tile{T, Shape}) where {T, Shape}
+@inline function store(arr::TileArray{T}, index, tile::Tile{T, Shape};
+                       latency::Union{Int, Nothing}=nothing,
+                       allow_tma::Bool=true) where {T, Shape}
     tv = Intrinsics.make_tensor_view(arr)
     pv = Intrinsics.make_partition_view(tv, Val(Shape), PaddingMode.Undetermined)
-    Intrinsics.store_partition_view(pv, tile, (promote(index...) .- One())...)
+    Intrinsics.store_partition_view(pv, tile, latency, allow_tma, (promote(index...) .- One())...)
     return tile  # XXX: enables constant folding; remove when possible (see "constant folding" test)
 end
 
-@inline function store(arr::TileArray{T}, index::Integer, tile::Tile{T, Shape}) where {T, Shape}
+@inline function store(arr::TileArray{T}, index::Integer, tile::Tile{T, Shape};
+                       latency::Union{Int, Nothing}=nothing,
+                       allow_tma::Bool=true) where {T, Shape}
     tv = Intrinsics.make_tensor_view(arr)
     pv = Intrinsics.make_partition_view(tv, Val(Shape), PaddingMode.Undetermined)
-    Intrinsics.store_partition_view(pv, tile, index - One())
+    Intrinsics.store_partition_view(pv, tile, latency, allow_tma, index - One())
     return tile  # XXX: enables constant folding; remove when possible (see "constant folding" test)
 end
 
 # Special case for 0D (scalar) tiles - reshape to 1D for partition view
-@inline function store(arr::TileArray{T}, index, tile::Tile{T, ()}) where {T}
+@inline function store(arr::TileArray{T}, index, tile::Tile{T, ()};
+                       latency::Union{Int, Nothing}=nothing,
+                       allow_tma::Bool=true) where {T}
     tv = Intrinsics.make_tensor_view(arr)
     # Reshape 0D tile to 1D (partition views require at least 1D)
     tile_1d = Intrinsics.reshape(tile, Val((1,)))
     pv = Intrinsics.make_partition_view(tv, Val((1,)), PaddingMode.Undetermined)
-    Intrinsics.store_partition_view(pv, tile_1d, (promote(index...) .- One())...)
+    Intrinsics.store_partition_view(pv, tile_1d, latency, allow_tma, (promote(index...) .- One())...)
     return tile  # XXX: enables constant folding; remove when possible (see "constant folding" test)
 end
 
-@inline function store(arr::TileArray{T}, index::Integer, tile::Tile{T, ()}) where {T}
+@inline function store(arr::TileArray{T}, index::Integer, tile::Tile{T, ()};
+                       latency::Union{Int, Nothing}=nothing,
+                       allow_tma::Bool=true) where {T}
     tv = Intrinsics.make_tensor_view(arr)
     tile_1d = Intrinsics.reshape(tile, Val((1,)))
     pv = Intrinsics.make_partition_view(tv, Val((1,)), PaddingMode.Undetermined)
-    Intrinsics.store_partition_view(pv, tile_1d, index - One())
+    Intrinsics.store_partition_view(pv, tile_1d, latency, allow_tma, index - One())
     return tile  # XXX: enables constant folding; remove when possible (see "constant folding" test)
 end
 
 # Keyword argument version - dispatch to positional version
-@inline function store(arr::TileArray{T}; index, tile::Tile{T, Shape}) where {T, Shape}
-    store(arr, index, tile)
+@inline function store(arr::TileArray{T}; index, tile::Tile{T, Shape},
+                       latency::Union{Int, Nothing}=nothing,
+                       allow_tma::Bool=true) where {T, Shape}
+    store(arr, index, tile; latency, allow_tma)
 end
 
 """
-    gather(array::TileArray{T, 1}, indices::Tile{I, S}) -> Tile{T, S}
+    gather(array::TileArray{T, 1}, indices::Tile{I, S}; latency=nothing) -> Tile{T, S}
 
 Gather elements from a 1D array using index tile.
 Indices are 1-indexed. Out-of-bounds indices return zero.
 
+# Optimization Hints
+- `latency`: Optional latency hint (1-10), or nothing for compiler default
+
 # Example
 ```julia
 base = (bid - 1) * TILE
 indices = base .+ ct.arange((TILE,), Int32)
-tile = ct.gather(arr, indices)
+tile = ct.gather(arr, indices; latency=3)
 ```
 """
-@inline function gather(array::TileArray{T, 1}, indices::Tile{I, S}) where {T, I <: Integer, S}
+@inline function gather(array::TileArray{T, 1}, indices::Tile{I, S};
+                        latency::Union{Int, Nothing}=nothing) where {T, I <: Integer, S}
     # Convert to 0-indexed
     indices_0 = indices .- one(I)
 
@@ -187,16 +217,20 @@ tile = ct.gather(arr, indices)
     # Padding for OOB (zero)
     padding = broadcast_to(Tile(zero(T)), S)
 
-    Intrinsics.load_ptr_tko(ptr_tile, mask, padding)
+    Intrinsics.load_ptr_tko(ptr_tile, latency, mask, padding)
 end
 
 """
-    gather(array::TileArray{T, 2}, indices::Tuple{Tile, Tile}) -> Tile{T, S}
+    gather(array::TileArray{T, 2}, indices::Tuple{Tile, Tile}; latency=nothing) -> Tile{T, S}
 
 Gather elements from a 2D array using a tuple of index tiles.
 Indices are 1-indexed. Index tiles are broadcast to a common shape.
+
+# Optimization Hints
+- `latency`: Optional latency hint (1-10), or nothing for compiler default
 """
-@inline function gather(array::TileArray{T, 2}, indices::Tuple{Tile{I0, S0}, Tile{I1, S1}}) where {T, I0 <: Integer, I1 <: Integer, S0, S1}
+@inline function gather(array::TileArray{T, 2}, indices::Tuple{Tile{I0, S0}, Tile{I1, S1}};
+                        latency::Union{Int, Nothing}=nothing) where {T, I0 <: Integer, I1 <: Integer, S0, S1}
     # Convert to 0-indexed
     idx0_0 = indices[1] .- one(I0)
     idx1_0 = indices[2] .- one(I1)
@@ -235,23 +269,27 @@ Indices are 1-indexed. Index tiles are broadcast to a common shape.
     # Padding for OOB (zero)
     padding = broadcast_to(Tile(zero(T)), S)
 
-    Intrinsics.load_ptr_tko(ptr_tile, mask, padding)
+    Intrinsics.load_ptr_tko(ptr_tile, latency, mask, padding)
 end
 
 """
-    scatter(array::TileArray{T, 1}, indices::Tile{I, S}, tile::Tile{T, S}) -> Nothing
+    scatter(array::TileArray{T, 1}, indices::Tile{I, S}, tile::Tile{T, S}; latency=nothing) -> Nothing
 
 Scatter elements to a 1D array at index tile positions.
 Indices are 1-indexed. Out-of-bounds indices are ignored.
 
+# Optimization Hints
+- `latency`: Optional latency hint (1-10), or nothing for compiler default
+
 # Example
 ```julia
 base = (bid - 1) * TILE
 indices = base .+ ct.arange((TILE,), Int32)
-ct.scatter(arr, indices, result_tile)
+ct.scatter(arr, indices, result_tile; latency=3)
 ```
 """
-@inline function scatter(array::TileArray{T, 1}, indices::Tile{I, S}, tile::Tile{T, S}) where {T, I <: Integer, S}
+@inline function scatter(array::TileArray{T, 1}, indices::Tile{I, S}, tile::Tile{T, S};
+                         latency::Union{Int, Nothing}=nothing) where {T, I <: Integer, S}
     # Convert to 0-indexed
     indices_0 = indices .- one(I)
 
@@ -268,16 +306,20 @@ ct.scatter(arr, indices, result_tile)
     lt_size = indices_i32 .< size_0d
     mask = ge_zero .& lt_size
 
-    Intrinsics.store_ptr_tko(ptr_tile, tile, mask)
+    Intrinsics.store_ptr_tko(ptr_tile, tile, latency, mask)
 end
 
 """
-    scatter(array::TileArray{T, 2}, indices::Tuple{Tile, Tile}, tile::Tile) -> Nothing
+    scatter(array::TileArray{T, 2}, indices::Tuple{Tile, Tile}, tile::Tile; latency=nothing) -> Nothing
 
 Scatter elements to a 2D array at index tile positions.
 Indices are 1-indexed. Index tiles and value tile must broadcast to same shape.
+
+# Optimization Hints
+- `latency`: Optional latency hint (1-10), or nothing for compiler default
 """
-@inline function scatter(array::TileArray{T, 2}, indices::Tuple{Tile{I0, S0}, Tile{I1, S1}}, tile::Tile{T, Stile}) where {T, I0 <: Integer, I1 <: Integer, S0, S1, Stile}
+@inline function scatter(array::TileArray{T, 2}, indices::Tuple{Tile{I0, S0}, Tile{I1, S1}}, tile::Tile{T, Stile};
+                         latency::Union{Int, Nothing}=nothing) where {T, I0 <: Integer, I1 <: Integer, S0, S1, Stile}
     # Convert to 0-indexed
     idx0_0 = indices[1] .- one(I0)
     idx1_0 = indices[2] .- one(I1)
@@ -314,7 +356,7 @@ Indices are 1-indexed. Index tiles and value tile must broadcast to same shape.
     mask1 = (idx1_i32 .>= zero_bc) .& (idx1_i32 .< size1_bc)
     mask = mask0 .& mask1
 
-    Intrinsics.store_ptr_tko(ptr_tile, tile_bc, mask)
+    Intrinsics.store_ptr_tko(ptr_tile, tile_bc, latency, mask)
 end
 
 #=============================================================================
diff --git a/test/codegen.jl b/test/codegen.jl
index 7e04e27..ae4b42e 100644
--- a/test/codegen.jl
+++ b/test/codegen.jl
@@ -1848,7 +1848,7 @@ end
 end
 
 #=============================================================================
- Entry Hints (optimization_hints attribute)
+ Entry Hints (kernel-level optimization hints)
 =============================================================================#
 
 @testset "Entry Hints" begin
@@ -1961,3 +1961,161 @@ end
         @test !isempty(bytecode32)
     end
 end
+
+#=============================================================================
+ Load / Store Hints (operation-level optimization hints)
+=============================================================================#
+
+@testset "Load / Store Optimization Hints" begin
+    # Common ArraySpecs for tests
+    spec1d = ct.ArraySpec{1}(16, true)
+
+    @testset "latency only on load" begin
+        @test @filecheck begin
+            @check "load_view_tko"
+            @check "optimization_hints = <sm_120 = {latency = 5}>"
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a
+                pid = ct.bid(1)
+                t = ct.load(a, pid, (16,); latency=5)
+                ct.store(a, pid, t)
+                return nothing
+            end
+        end
+    end
+
+    @testset "allow_tma=false only on load" begin
+        @test @filecheck begin
+            @check "load_view_tko"
+            @check "optimization_hints = <sm_120 = {allow_tma = false}>"
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a
+                pid = ct.bid(1)
+                t = ct.load(a, pid, (16,); allow_tma=false)
+                ct.store(a, pid, t)
+                return nothing
+            end
+        end
+    end
+
+    @testset "both hints on load" begin
+        @test @filecheck begin
+            @check "load_view_tko"
+            @check "optimization_hints = <sm_120 = {allow_tma = false, latency = 7}>"
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a
+                pid = ct.bid(1)
+                t = ct.load(a, pid, (16,); latency=7, allow_tma=false)
+                ct.store(a, pid, t)
+                return nothing
+            end
+        end
+    end
+
+    @testset "latency only on store" begin
+        @test @filecheck begin
+            @check "store_view_tko"
+            @check "optimization_hints = <sm_120 = {latency = 3}>"
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a
+                pid = ct.bid(1)
+                t = ct.load(a, pid, (16,))
+                ct.store(a, pid, t; latency=3)
+                return nothing
+            end
+        end
+    end
+
+    @testset "allow_tma=false only on store" begin
+        @test @filecheck begin
+            @check "store_view_tko"
+            @check "optimization_hints = <sm_120 = {allow_tma = false}>"
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a
+                pid = ct.bid(1)
+                t = ct.load(a, pid, (16,))
+                ct.store(a, pid, t; allow_tma=false)
+                return nothing
+            end
+        end
+    end
+
+    @testset "both hints on store" begin
+        @test @filecheck begin
+            @check "store_view_tko"
+            @check "optimization_hints = <sm_120 = {allow_tma = false, latency = 2}>"
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a
+                pid = ct.bid(1)
+                t = ct.load(a, pid, (16,))
+                ct.store(a, pid, t; allow_tma=false, latency=2)
+                return nothing
+            end
+        end
+    end
+
+    @testset "latency validation" begin
+        @test_throws "latency must be between 1 and 10" begin
+            code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a
+                pid = ct.bid(1)
+                ct.load(a, pid, (16,); latency=11)
+            end
+        end
+
+        bytecode1 = code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a
+            pid = ct.bid(1)
+            t = ct.load(a, pid, (16,); latency=8)
+            ct.store(a, pid, t)
+            return nothing
+        end
+        @test !isempty(bytecode1)
+    end
+
+    @testset "multiple operations with mixed hints" begin
+        @test @filecheck begin
+            # First load with latency
+            @check "load_view_tko"
+            @check "optimization_hints = <sm_120 = {latency = 5}>"
+            # Second load with allow_tma=false
+            @check "load_view_tko"
+            @check "optimization_hints = <sm_120 = {allow_tma = false}>"
+            # Third load with no hints
+            @check "load_view_tko"
+            @check_not "optimization_hints"
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d},
+                               ct.TileArray{Float32, 1, spec1d},
+                               ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a, b, c
+                pid = ct.bid(1)
+                t1 = ct.load(a, pid, (16,); latency=5)
+                t2 = ct.load(b, pid, (16,); allow_tma=false)
+                t3 = ct.load(c, pid, (16,))
+                result = t1 + t2 + t3
+                ct.store(a, pid, result)
+                return nothing
+            end
+        end
+    end
+
+    # Pointer-based operations (gather/scatter) with latency hints
+    @testset "gather with latency hint" begin
+        @test @filecheck begin
+            @check "load_ptr_tko"  # the op emitted for ct.gather
+            @check "optimization_hints = <sm_120 = {latency = 3}>"
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}, ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a, b
+                pid = ct.bid(1)
+                indices = ct.arange((16,), Int32)
+                tile = ct.gather(a, indices; latency=3)  # hint expected on the pointer load
+                ct.store(b, pid, tile)  # unhinted store keeps the gathered tile live
+                return nothing
+            end
+        end
+    end
+
+    @testset "scatter with latency hint" begin
+        @test @filecheck begin
+            @check "store_ptr_tko"  # the op emitted for ct.scatter
+            @check "optimization_hints = <sm_120 = {latency = 5}>"
+            ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}, ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a, b
+                pid = ct.bid(1)
+                tile = ct.load(a, pid, (16,))  # unhinted load; only the scatter carries a hint
+                indices = ct.arange((16,), Int32)
+                ct.scatter(b, indices, tile; latency=5)  # hint expected on the pointer store
+                return nothing
+            end
+        end
+    end
+end
diff --git a/test/execution.jl b/test/execution.jl
index 44e5114..8297a9d 100644
--- a/test/execution.jl
+++ b/test/execution.jl
@@ -1590,7 +1590,7 @@ end
 
 end
 
-@testset "Entry Hints Integration" begin
+@testset "Entry Hints" begin
 
 @testset "launch with num_ctas" begin
     function vadd_kernel_num_ctas(a::ct.TileArray{Float32,1},
@@ -1656,3 +1656,346 @@ end
 end
 
 end
+
+@testset "Load / Store Optimization Hints" begin
+
+@testset "load with latency hint" begin
+    function vadd_with_load_latency(a::ct.TileArray{Float32,1},
+                                    b::ct.TileArray{Float32,1},
+                                    c::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,); latency=5)
+        tb = ct.load(b, pid, (16,); latency=3)
+        ct.store(c, pid, ta + tb)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.ones(Float32, n)
+    b = CUDA.ones(Float32, n) .* 2
+    c = CUDA.zeros(Float32, n)
+
+    ct.launch(vadd_with_load_latency, 64, a, b, c)
+    CUDA.synchronize()
+    @test Array(c) ≈ ones(Float32, n) .* 3
+end
+
+@testset "load with allow_tma=false" begin
+    function vadd_no_tma(a::ct.TileArray{Float32,1},
+                         b::ct.TileArray{Float32,1},
+                         c::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,); allow_tma=false)
+        tb = ct.load(b, pid, (16,); allow_tma=false)
+        ct.store(c, pid, ta + tb)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.ones(Float32, n)
+    b = CUDA.ones(Float32, n) .* 2
+    c = CUDA.zeros(Float32, n)
+
+    ct.launch(vadd_no_tma, 64, a, b, c)
+    CUDA.synchronize()
+    @test Array(c) ≈ ones(Float32, n) .* 3
+end
+
+@testset "load with both hints" begin
+    function vadd_both_load_hints(a::ct.TileArray{Float32,1},
+                                  b::ct.TileArray{Float32,1},
+                                  c::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,); latency=7, allow_tma=false)
+        tb = ct.load(b, pid, (16,); latency=4, allow_tma=true)
+        ct.store(c, pid, ta + tb)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.ones(Float32, n)
+    b = CUDA.ones(Float32, n) .* 2
+    c = CUDA.zeros(Float32, n)
+
+    ct.launch(vadd_both_load_hints, 64, a, b, c)
+    CUDA.synchronize()
+    @test Array(c) ≈ ones(Float32, n) .* 3
+end
+
+@testset "store with latency hint" begin
+    function copy_with_store_latency(a::ct.TileArray{Float32,1},
+                                     b::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,))
+        ct.store(b, pid, ta; latency=2)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.rand(Float32, n)
+    b = CUDA.zeros(Float32, n)
+
+    ct.launch(copy_with_store_latency, 64, a, b)
+    CUDA.synchronize()
+    @test Array(b) ≈ Array(a)
+end
+
+@testset "store with allow_tma=false" begin
+    function copy_no_tma_store(a::ct.TileArray{Float32,1},
+                               b::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,))
+        ct.store(b, pid, ta; allow_tma=false)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.rand(Float32, n)
+    b = CUDA.zeros(Float32, n)
+
+    ct.launch(copy_no_tma_store, 64, a, b)
+    CUDA.synchronize()
+    @test Array(b) ≈ Array(a)
+end
+
+@testset "different hints on load and store" begin
+    function vadd_mixed_hints(a::ct.TileArray{Float32,1},
+                              b::ct.TileArray{Float32,1},
+                              c::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        # Load with high latency, no TMA
+        ta = ct.load(a, pid, (16,); latency=8, allow_tma=false)
+        tb = ct.load(b, pid, (16,); latency=6, allow_tma=false)
+        # Store with low latency, allow TMA
+        ct.store(c, pid, ta + tb; latency=2, allow_tma=true)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.ones(Float32, n)
+    b = CUDA.ones(Float32, n) .* 2
+    c = CUDA.zeros(Float32, n)
+
+    ct.launch(vadd_mixed_hints, 64, a, b, c)
+    CUDA.synchronize()
+    @test Array(c) ≈ ones(Float32, n) .* 3
+end
+
+@testset "2D matmul with hints" begin
+    function matmul_with_hints(a::ct.TileArray{Float32,2},
+                               b::ct.TileArray{Float32,2},
+                               c::ct.TileArray{Float32,2})
+        bidx = ct.bid(1)
+        bidy = ct.bid(2)
+        # Load with latency hints
+        tile_a = ct.load(a, (bidx, 1), (32, 16); latency=5)
+        tile_b = ct.load(b, (1, bidy), (16, 32); latency=5)
+        result = tile_a * tile_b
+        # Store with latency hint
+        ct.store(c, (bidx, bidy), result; latency=3)
+        return nothing
+    end
+
+    M, K, N = 64, 16, 64
+    a = CUDA.rand(Float32, M, K)
+    b = CUDA.rand(Float32, K, N)
+    c = CUDA.zeros(Float32, M, N)
+
+    grid_x = cld(M, 32)
+    grid_y = cld(N, 32)
+    ct.launch(matmul_with_hints, (grid_x, grid_y, 1), a, b, c)
+    CUDA.synchronize()
+
+    # Verify against CPU reference
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    c_cpu = Array(c)
+    c_ref = a_cpu * b_cpu
+
+    @test c_cpu ≈ c_ref rtol=1e-5
+end
+
+@testset "reduction with hints" begin
+    function reduce_with_hints(a::ct.TileArray{Float32,2},
+                               b::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        # Load with hints
+        tile = ct.load(a, (pid, 1), (1, 128); latency=6, allow_tma=false)
+        sums = ct.reduce_sum(tile, 2)
+        # Store with hints
+        ct.store(b, pid, sums; latency=2)
+        return nothing
+    end
+
+    m, n = 64, 128
+    a = CUDA.rand(Float32, m, n)
+    b = CUDA.zeros(Float32, m)
+
+    ct.launch(reduce_with_hints, m, a, b)
+    CUDA.synchronize()
+
+    # Each row should be summed
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] ≈ sum(a_cpu[i, :]) rtol=1e-3
+    end
+end
+
+@testset "transpose with hints" begin
+    function transpose_with_hints(x::ct.TileArray{Float32,2},
+                                  y::ct.TileArray{Float32,2})
+        bidx = ct.bid(1)
+        bidy = ct.bid(2)
+        # Load with high latency
+        tile = ct.load(x, (bidx, bidy), (32, 32); latency=9)
+        transposed = ct.transpose(tile)
+        # Store with lower latency
+        ct.store(y, (bidy, bidx), transposed; latency=4)
+        return nothing
+    end
+
+    m, n = 256, 128
+    tile_size = 32
+    x = CUDA.rand(Float32, m, n)
+    y = CUDA.zeros(Float32, n, m)
+
+    ct.launch(transpose_with_hints, (cld(m, tile_size), cld(n, tile_size)), x, y)
+    CUDA.synchronize()
+
+    @test Array(y) ≈ transpose(Array(x))
+end
+
+@testset "complex kernel with multiple loads/stores with hints" begin
+    function complex_hints_kernel(a::ct.TileArray{Float32,1},
+                                  b::ct.TileArray{Float32,1},
+                                  c::ct.TileArray{Float32,1},
+                                  d::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        # Multiple loads with different hints
+        ta = ct.load(a, pid, (16,); latency=10, allow_tma=false)
+        tb = ct.load(b, pid, (16,); latency=5, allow_tma=true)
+        tc = ct.load(c, pid, (16,); latency=7)
+
+        # Compute result
+        result = ta + tb + tc
+
+        # Store with hint
+        ct.store(d, pid, result; latency=1, allow_tma=false)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.ones(Float32, n)
+    b = CUDA.ones(Float32, n) .* 2
+    c = CUDA.ones(Float32, n) .* 3
+    d = CUDA.zeros(Float32, n)
+
+    ct.launch(complex_hints_kernel, 64, a, b, c, d)
+    CUDA.synchronize()
+    @test Array(d) ≈ ones(Float32, n) .* 6
+end
+
+@testset "hints with Float64" begin
+    function vadd_f64_hints(a::ct.TileArray{Float64,1},
+                            b::ct.TileArray{Float64,1},
+                            c::ct.TileArray{Float64,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,); latency=8)
+        tb = ct.load(b, pid, (16,); latency=8)
+        ct.store(c, pid, ta + tb; latency=4)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.rand(Float64, n)
+    b = CUDA.rand(Float64, n)
+    c = CUDA.zeros(Float64, n)
+
+    ct.launch(vadd_f64_hints, 64, a, b, c)
+    CUDA.synchronize()
+    @test Array(c) ≈ Array(a) + Array(b)
+end
+
+@testset "hints with Float16" begin
+    function vadd_f16_hints(a::ct.TileArray{Float16,1},
+                            b::ct.TileArray{Float16,1},
+                            c::ct.TileArray{Float16,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,); latency=3, allow_tma=false)
+        tb = ct.load(b, pid, (16,); latency=3, allow_tma=false)
+        ct.store(c, pid, ta + tb; latency=1)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.rand(Float16, n)
+    b = CUDA.rand(Float16, n)
+    c = CUDA.zeros(Float16, n)
+
+    ct.launch(vadd_f16_hints, 64, a, b, c)
+    CUDA.synchronize()
+    @test Array(c) ≈ Array(a) + Array(b)
+end
+
+@testset "boundary latency values" begin
+    function test_boundary_latency(a::ct.TileArray{Float32,1},
+                                   b::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        # Min and max valid latency values
+        ta = ct.load(a, pid, (16,); latency=1)
+        ct.store(b, pid, ta; latency=10)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.rand(Float32, n)
+    b = CUDA.zeros(Float32, n)
+
+    ct.launch(test_boundary_latency, 64, a, b)
+    CUDA.synchronize()
+    @test Array(b) ≈ Array(a)
+end
+
+# Pointer-based operations (gather/scatter) with latency hints
+@testset "gather with latency hint" begin
+    function gather_with_latency(a::ct.TileArray{Float32,1},
+                                 b::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        base = (pid - 1) * 16
+        indices = base .+ ct.arange((16,), Int32)
+        tile = ct.gather(a, indices; latency=5)
+        ct.store(b, pid, tile)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.rand(Float32, n)
+    b = CUDA.zeros(Float32, n)
+
+    ct.launch(gather_with_latency, 64, a, b)
+    CUDA.synchronize()
+    @test Array(b) ≈ Array(a)
+end
+
+@testset "scatter with latency hint" begin
+    function scatter_with_latency(a::ct.TileArray{Float32,1},
+                                  b::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, pid, (16,))
+        base = (pid - 1) * 16
+        indices = base .+ ct.arange((16,), Int32)
+        ct.scatter(b, indices, tile; latency=3)
+        return nothing
+    end
+
+    n = 1024
+    a = CUDA.rand(Float32, n)
+    b = CUDA.zeros(Float32, n)
+
+    ct.launch(scatter_with_latency, 64, a, b)
+    CUDA.synchronize()
+    @test Array(b) ≈ Array(a)
+end
+
+end

From 24a25bb97daa69250a986f13e38cb04f08e7b098 Mon Sep 17 00:00:00 2001
From: AntonOresten <antonoresten@proton.me>
Date: Wed, 14 Jan 2026 09:28:17 +0100
Subject: [PATCH 2/2] docstring and comment fixes

---
 src/bytecode/writer.jl           | 2 +-
 src/compiler/intrinsics/views.jl | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl
index 67c3df1..eb87585 100644
--- a/src/bytecode/writer.jl
+++ b/src/bytecode/writer.jl
@@ -544,7 +544,7 @@ function finalize_function!(func_buf::Vector{UInt8}, cb::CodeBuilder,
 end
 
 #=============================================================================
- Optimization and Entry Hints 
+ Optimization Hints
 =============================================================================#
 
 """
diff --git a/src/compiler/intrinsics/views.jl b/src/compiler/intrinsics/views.jl
index 0690bc9..47986a9 100644
--- a/src/compiler/intrinsics/views.jl
+++ b/src/compiler/intrinsics/views.jl
@@ -1,5 +1,8 @@
 # views
 
+"""
+Convert integer padding mode value to bytecode PaddingValue enum.
+"""
 function padding_mode_to_padding_value(mode::Int)
     mode == 0 ? PaddingMissing :
     mode == 1 ? PaddingZero :