From e6ba9731616dc03d723326447ca0cf414e1711a7 Mon Sep 17 00:00:00 2001 From: arhik Date: Sat, 17 Jan 2026 10:51:48 +0000 Subject: [PATCH 1/8] feat: Add integer reduction support for reduce_sum and reduce_max MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit enables reduce_sum and reduce_max operations on all numeric types, extending beyond the previous float-only support. ## Infrastructure Changes ### Bytecode Layer - Added IntegerIdentityOp struct with signed/unsigned handling - Added encode_tagged_int! for integer identity encoding - Added mask_to_width function with zigzag encoding for signed types - Added encode_identity! dispatch for FloatIdentityOp and IntegerIdentityOp - Refactored ReduceIdentity → IdentityOp for extensibility ### Compiler Layer - Refactored emit_reduce! to use dispatch-based approach - Added operation_identity dispatch for add/max operations - Added encode_reduce_body dispatch for float and integer operations - Removed T <: AbstractFloat constraints from intrinsics ### Language Layer - Removed type constraints from reduce_sum and reduce_max in operations.jl ## Test Coverage ### Codegen Tests - Added FileCheck tests for Int32/UInt32 reduce_sum and reduce_max - Verifies correct IR generation (addi, maxi instructions) ### Execution Tests - Factory pattern for easy extension (makeReduceKernel, cpu_reduce) - Tests 10 types: Int8, Int16, Int32, Int64, UInt16, UInt32, UInt64, Float16, Float32, Float64 - Tests 2 operations: reduce_sum, reduce_max - CPU verification for all test cases - Type-appropriate input ranges to prevent overflow ## Files Changed - src/bytecode/encodings.jl: Fix IdentityOp type annotation - src/bytecode/writer.jl: Integer identity infrastructure - src/compiler/intrinsics.jl: Import identity types - src/compiler/intrinsics/core.jl: Dispatch-based reduce implementation - src/cuTile.jl: Export identity types - src/language/operations.jl: Remove type constraints - 
test/codegen.jl: Add integer reduction codegen tests - test/execution.jl: Add extendable execution tests ## Extensibility The infrastructure is designed for easy extension: - Add new reduce operations by defining operation_identity and encode_reduce_body methods - Add new types by adding to TEST_TYPES array and appropriate data generation --- src/bytecode/encodings.jl | 2 +- src/bytecode/writer.jl | 90 ++++++++++++++++++++++++---- src/compiler/intrinsics.jl | 1 + src/compiler/intrinsics/core.jl | 72 +++++++++++++++++++---- src/cuTile.jl | 3 + src/language/operations.jl | 8 +-- test/codegen.jl | 56 ++++++++++++++++++ test/execution.jl | 101 ++++++++++++++++++++++++++++++++ 8 files changed, 304 insertions(+), 29 deletions(-) diff --git a/src/bytecode/encodings.jl b/src/bytecode/encodings.jl index 9f06415..9d20820 100644 --- a/src/bytecode/encodings.jl +++ b/src/bytecode/encodings.jl @@ -1291,7 +1291,7 @@ function encode_ReduceOp!(body::Function, cb::CodeBuilder, result_types::Vector{TypeId}, operands::Vector{Value}, dim::Int, - identities::Vector{<:ReduceIdentity}, + identities::Vector{<:IdentityOp}, body_scalar_types::Vector{TypeId}) encode_varint!(cb.buf, Opcode.ReduceOp) diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl index eb87585..52b693e 100644 --- a/src/bytecode/writer.jl +++ b/src/bytecode/writer.jl @@ -234,30 +234,42 @@ end =============================================================================# """ - ReduceIdentity + IdentityOp -Abstract type for reduce identity attributes. +Abstract type for binary operation identity attributes (reduce, scan, etc.). """ -abstract type ReduceIdentity end +abstract type IdentityOp end """ - FloatIdentity(value, type_id, dtype) + FloatIdentityOp(value, type_id, dtype) -Float identity value for reduce operations. +Float identity value for binary operations. 
""" -struct FloatIdentity <: ReduceIdentity +struct FloatIdentityOp <: IdentityOp value::Float64 type_id::TypeId dtype::Type # Float16, Float32, Float64, etc. end """ - encode_tagged_float!(cb, identity::FloatIdentity) + IntegerIdentityOp(value, type_id, dtype, signed) + +Integer identity value for binary operations. +""" +struct IntegerIdentityOp <: IdentityOp + value::UInt128 # Store as UInt128 to handle all unsigned values up to 64 bits + type_id::TypeId + dtype::Type # Int8, Int16, Int32, Int64, UInt8, etc. + signed::Bool # true for signed, false for unsigned +end + +""" + encode_tagged_float!(cb, identity::FloatIdentityOp) Encode a tagged float attribute for reduce identity. Format: tag(Float=0x02) + typeid + ap_int(value_bits) """ -function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentity) +function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentityOp) # Tag for Float attribute push!(cb.buf, 0x02) # Type ID @@ -267,6 +279,53 @@ function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentity) encode_signed_varint!(cb.buf, bits) end +""" + encode_tagged_int!(cb, identity::IntegerIdentityOp) + +Encode a tagged integer identity attribute. +Format: tag(Int=0x01) + typeid + ap_int(value) +""" +function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentityOp) + # Tag for Int attribute + push!(cb.buf, 0x01) + # Type ID + encode_typeid!(cb.buf, identity.type_id) + # Value: signed uses zigzag varint, unsigned uses plain varint + # Mask value to correct bit width and apply zigzag if signed + masked_value = mask_to_width(identity.value, identity.dtype, identity.signed) + if identity.signed + encode_signed_varint!(cb.buf, masked_value) + else + encode_varint!(cb.buf, masked_value) + end +end + +""" + mask_to_width(value, dtype, signed) + +Mask a UInt128 value to the correct bit width for the given type and apply zigzag if signed. 
+""" +mask_to_width(value::UInt128, ::Type{Int64}, signed::Bool) = + let masked = UInt64(value & 0xFFFFFFFFFFFFFFFF) + UInt64((masked << 1) ⊻ (masked >>> 63)) + end +mask_to_width(value::UInt128, ::Type{Int32}, signed::Bool) = + let masked = UInt32(value & 0xFFFFFFFF) + UInt32((masked << 1) ⊻ (masked >>> 31)) + end +mask_to_width(value::UInt128, ::Type{Int16}, signed::Bool) = + let masked = UInt16(value & 0xFFFF) + UInt16((masked << 1) ⊻ (masked >>> 15)) + end +mask_to_width(value::UInt128, ::Type{Int8}, signed::Bool) = + let masked = UInt8(value & 0xFF) + UInt8((masked << 1) ⊻ (masked >>> 7)) + end +mask_to_width(value::UInt128, ::Type{UInt64}, signed::Bool) = UInt64(value & 0xFFFFFFFFFFFFFFFF) +mask_to_width(value::UInt128, ::Type{UInt32}, signed::Bool) = UInt32(value & 0xFFFFFFFF) +mask_to_width(value::UInt128, ::Type{UInt16}, signed::Bool) = UInt16(value & 0xFFFF) +mask_to_width(value::UInt128, ::Type{UInt8}, signed::Bool) = UInt8(value & 0xFF) + """ float_to_bits(value, dtype) @@ -304,15 +363,24 @@ end """ encode_identity_array!(cb, identities) -Encode an array of reduce identity attributes. +Encode an array of binary operation identity attributes. +Dispatches on identity type to encode correctly. """ -function encode_identity_array!(cb::CodeBuilder, identities::Vector{<:ReduceIdentity}) +function encode_identity_array!(cb::CodeBuilder, identities::Vector{<:IdentityOp}) encode_varint!(cb.buf, length(identities)) for identity in identities - encode_tagged_float!(cb, identity) + encode_identity!(cb, identity) end end +""" + encode_identity!(cb, identity) + +Encode a single identity attribute, dispatching on type. 
+""" +encode_identity!(cb::CodeBuilder, identity::FloatIdentityOp) = encode_tagged_float!(cb, identity) +encode_identity!(cb::CodeBuilder, identity::IntegerIdentityOp) = encode_tagged_int!(cb, identity) + """ BytecodeWriter diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl index 16c55da..e522141 100644 --- a/src/compiler/intrinsics.jl +++ b/src/compiler/intrinsics.jl @@ -8,6 +8,7 @@ using Base: compilerbarrier, donotdelete using ..cuTile: Tile, TileArray, Constant, TensorView, PartitionView using ..cuTile: Signedness, SignednessSigned, SignednessUnsigned using ..cuTile: ComparisonPredicate, CmpLessThan, CmpLessThanOrEqual, CmpGreaterThan, CmpGreaterThanOrEqual, CmpEqual, CmpNotEqual +using ..cuTile: IdentityOp, FloatIdentityOp, IntegerIdentityOp end diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index 7fb2530..61a4a0b 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -512,7 +512,7 @@ end Sum reduction along 0-indexed axis. Compiled to cuda_tile.reduce with ADD. """ - @noinline function reduce_sum(tile::Tile{T, S}, ::Val{axis}) where {T <: AbstractFloat, S, axis} + @noinline function reduce_sum(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis} reduced_shape = ntuple(i -> S[i < axis + 1 ? i : i + 1], length(S) - 1) Tile{T, reduced_shape}() end @@ -523,7 +523,7 @@ end Maximum reduction along 0-indexed axis. Compiled to cuda_tile.reduce with MAX. """ - @noinline function reduce_max(tile::Tile{T, S}, ::Val{axis}) where {T <: AbstractFloat, S, axis} + @noinline function reduce_max(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis} reduced_shape = ntuple(i -> S[i < axis + 1 ? 
i : i + 1], length(S) - 1) Tile{T, reduced_shape}() end @@ -562,28 +562,74 @@ function emit_reduce!(ctx::CGCtx, args, reduce_fn::Symbol) # Scalar type for reduction body (0D tile) scalar_tile_type = tile_type!(tt, dtype, Int[]) - # Create identity value - use simple dtype (f32), not tile type - identity_val = reduce_fn == :add ? -0.0 : (reduce_fn == :max ? -Inf : 0.0) - identity = FloatIdentity(identity_val, dtype, elem_type) + # Create identity value via dispatch on reduction function and element type + identity = operation_identity(Val(reduce_fn), dtype, elem_type) # Emit ReduceOp results = encode_ReduceOp!(cb, [output_tile_type], [input_tv.v], axis, [identity], [scalar_tile_type]) do block_args acc, elem = block_args[1], block_args[2] - if reduce_fn == :add - res = encode_AddFOp!(cb, scalar_tile_type, acc, elem) - elseif reduce_fn == :max - res = encode_MaxFOp!(cb, scalar_tile_type, acc, elem) - else - error("Unsupported reduction function: $reduce_fn") - end - + res = encode_reduce_body(cb, scalar_tile_type, acc, elem, Val(reduce_fn), elem_type) encode_YieldOp!(cb, [res]) end CGVal(results[1], output_tile_type, Tile{elem_type, Tuple(output_shape)}, output_shape) end +#=============================================================================# +# Reduce Identity Values via Dispatch +#=============================================================================# + +""" + operation_identity(fn, dtype, elem_type) -> IdentityOp + to_uint128(value) + +Convert an integer value to UInt128 for storage in IntegerIdentityOp. +For signed types, this returns the two's complement bit representation. 
+""" +# Unsigned types: directly convert +to_uint128(value::UInt64) = UInt128(value) +to_uint128(value::UInt32) = UInt128(value) +to_uint128(value::UInt16) = UInt128(value) +to_uint128(value::UInt8) = UInt128(value) +# Signed types: reinterpret as unsigned first, then convert +to_uint128(value::Int64) = UInt128(reinterpret(UInt64, value)) +to_uint128(value::Int32) = UInt128(reinterpret(UInt32, value)) +to_uint128(value::Int16) = UInt128(reinterpret(UInt16, value)) +to_uint128(value::Int8) = UInt128(reinterpret(UInt8, value)) + +""" + operation_identity(fn, dtype, elem_type) -> IdentityOp + +Return the identity value for a binary operation (reduce, scan, etc.). +Identity must satisfy: identity ⊕ x = x for the operation. +""" + +# Addition identity: 0 + x = x +operation_identity(::Val{:add}, dtype, ::Type{T}) where T <: AbstractFloat = + FloatIdentityOp(zero(T), dtype, T) +operation_identity(::Val{:add}, dtype, ::Type{T}) where T <: Integer = + IntegerIdentityOp(to_uint128(zero(T)), dtype, T, T <: Signed) + +# Maximum identity: max(typemin(T), x) = x +operation_identity(::Val{:max}, dtype, ::Type{T}) where T <: AbstractFloat = + FloatIdentityOp(typemin(T), dtype, T) +operation_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer = + IntegerIdentityOp(to_uint128(typemin(T)), dtype, T, T <: Signed) + +#=============================================================================# +# Reduce Body Operations - dispatch on Val{fn} and elem_type +#=============================================================================# + +encode_reduce_body(cb, type, acc, elem, ::Val{:add}, ::Type{T}) where T <: AbstractFloat = + encode_AddFOp!(cb, type, acc, elem) +encode_reduce_body(cb, type, acc, elem, ::Val{:max}, ::Type{T}) where T <: AbstractFloat = + encode_MaxFOp!(cb, type, acc, elem) +encode_reduce_body(cb, type, acc, elem, ::Val{:add}, ::Type{T}) where T <: Integer = + encode_AddIOp!(cb, type, acc, elem) +encode_reduce_body(cb, type, acc, elem, ::Val{:max}, ::Type{T}) 
where T <: Integer = + encode_MaxIOp!(cb, type, acc, elem; signedness=T <: Signed ? SignednessSigned : SignednessUnsigned) + # cuda_tile.reshape @eval Intrinsics begin diff --git a/src/cuTile.jl b/src/cuTile.jl index 375aaa2..a3b019d 100644 --- a/src/cuTile.jl +++ b/src/cuTile.jl @@ -40,4 +40,7 @@ include("language/atomics.jl") public launch launch() = error("Please import CUDA.jl before using `cuTile.launch`.") +# Export identity types for reduction operations +public IdentityOp, FloatIdentityOp, IntegerIdentityOp + end # module cuTile diff --git a/src/language/operations.jl b/src/language/operations.jl index 463a358..5b00350 100644 --- a/src/language/operations.jl +++ b/src/language/operations.jl @@ -529,10 +529,10 @@ Returns a tile with the specified dimension removed. sums = ct.reduce_sum(tile, 2) # Returns (128,) tile ``` """ -@inline function reduce_sum(tile::Tile{T, S}, axis::Integer) where {T <: AbstractFloat, S} +@inline function reduce_sum(tile::Tile{T, S}, axis::Integer) where {T, S} Intrinsics.reduce_sum(tile, Val(axis - 1)) end -@inline function reduce_sum(tile::Tile{T, S}, ::Val{axis}) where {T <: AbstractFloat, S, axis} +@inline function reduce_sum(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis} Intrinsics.reduce_sum(tile, Val(axis - 1)) end @@ -546,10 +546,10 @@ Maximum reduction along the specified axis (1-indexed). 
maxes = ct.reduce_max(tile, 2) # Max along axis 2 ``` """ -@inline function reduce_max(tile::Tile{T, S}, axis::Integer) where {T <: AbstractFloat, S} +@inline function reduce_max(tile::Tile{T, S}, axis::Integer) where {T, S} Intrinsics.reduce_max(tile, Val(axis - 1)) end -@inline function reduce_max(tile::Tile{T, S}, ::Val{axis}) where {T <: AbstractFloat, S, axis} +@inline function reduce_max(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis} Intrinsics.reduce_max(tile, Val(axis - 1)) end diff --git a/test/codegen.jl b/test/codegen.jl index ae4b42e..c189310 100644 --- a/test/codegen.jl +++ b/test/codegen.jl @@ -387,6 +387,62 @@ end end + # Integer reduce_sum (Int32) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,2,spec2d}, ct.TileArray{Int32,1,spec1d}}) do a, b + pid = ct.bid(1) + tile = ct.load(a, pid, (4, 16)) + @check "reduce" + @check "addi" + sums = ct.reduce_sum(tile, 2) + ct.store(b, pid, sums) + return + end + end + + # Integer reduce_max (Int32) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,2,spec2d}, ct.TileArray{Int32,1,spec1d}}) do a, b + pid = ct.bid(1) + tile = ct.load(a, pid, (4, 16)) + @check "reduce" + @check "maxi" + maxes = ct.reduce_max(tile, 2) + ct.store(b, pid, maxes) + return + end + end + + # Unsigned reduce_sum (UInt32) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{UInt32,2,spec2d}, ct.TileArray{UInt32,1,spec1d}}) do a, b + pid = ct.bid(1) + tile = ct.load(a, pid, (4, 16)) + @check "reduce" + @check "addi" + sums = ct.reduce_sum(tile, 2) + ct.store(b, pid, sums) + return + end + end + + # Unsigned reduce_max (UInt32) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{UInt32,2,spec2d}, ct.TileArray{UInt32,1,spec1d}}) do a, b + pid = ct.bid(1) + tile = ct.load(a, pid, (4, 16)) + @check "reduce" + @check "maxi" + maxes = ct.reduce_max(tile, 2) + ct.store(b, pid, maxes) + return + end + end + @testset 
"select" begin @test @filecheck begin @check_label "entry" diff --git a/test/execution.jl b/test/execution.jl index 8297a9d..611d35d 100644 --- a/test/execution.jl +++ b/test/execution.jl @@ -1842,6 +1842,107 @@ end end end +# Kernel factory for reduce operations - extendable pattern +function makeReduceKernel(::Type{T}, op::Symbol) where {T} + reduceFunc = if op == :reduce_sum + ct.reduce_sum + elseif op == :reduce_max + ct.reduce_max + # ADD NEW OPERATIONS HERE + # elseif op == :reduce_min + # ct.reduce_min + # elseif op == :reduce_mul + # ct.reduce_mul + end + + @inline function kernel(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, tileSz::ct.Constant{Int}) + ct.store(b, ct.bid(1), reduceFunc(ct.load(a, ct.bid(1), (tileSz[],)), Val(1))) + return nothing + end + return kernel +end + +# CPU reference implementation for reduce operations - extendable pattern +function cpu_reduce(a_reshaped::AbstractArray{T}, op::Symbol) where {T} + if op == :reduce_sum + result = sum(a_reshaped, dims=1)[:] + # For unsigned types, apply mask to handle overflow + if T <: Unsigned + result .= result .& typemax(T) + end + return result + elseif op == :reduce_max + return maximum(a_reshaped, dims=1)[:] + # ADD NEW OPERATIONS HERE + # elseif op == :reduce_min + # return minimum(a_reshaped, dims=1)[:] + # elseif op == :reduce_mul + # return prod(a_reshaped, dims=1)[:] + end +end + +@testset "1D reduce operations (extendable)" begin + # Test parameters - easily extendable + TILE_SIZE = 32 + N = 1024 + + # Supported types - add new types here + TEST_TYPES = [Int8, Int16, Int32, Int64, UInt16, UInt32, UInt64, Float16, Float32, Float64] + + # Supported operations - add new operations here + TEST_OPS = [:reduce_sum, :reduce_max] + + @testset "Type: $elType, Operation: $op" for elType in TEST_TYPES, op in TEST_OPS + # Create kernel using factory + reduceKernel = try + makeReduceKernel(elType, op) + catch e + @test_broken false + rethrow() + end + + # Generate input data with type-appropriate 
ranges + # Int8: -3 to 3 (32 * 3 = 96, safely within Int8 range -128 to 127) + # Int16: -800 to 800 (32 * 800 = 25,600, safely within Int16 range -32,768 to 32,767) + # UInt16: 1 to 2000 (32 * 2000 = 64,000, safely within UInt16 range 0 to 65,535) + # Larger types: -1000 to 1000 (arbitrary but covers positive/negative) + # Floats: 0 to 1 (CUDA.rand default) + if elType == Int8 + a_gpu = CuArray{Int8}(rand(-3:3, N)) + elseif elType == Int16 + a_gpu = CuArray{Int16}(rand(-800:800, N)) + elseif elType == UInt16 + a_gpu = CuArray{UInt16}(rand(1:2000, N)) + elseif elType <: Integer && elType <: Signed + a_gpu = CuArray{elType}(rand(-1000:1000, N)) + else + a_gpu = CUDA.rand(elType, N) + end + b_gpu = CUDA.zeros(elType, cld(N, TILE_SIZE)) + + # Launch kernel + try + CUDA.@sync ct.launch(reduceKernel, cld(N, TILE_SIZE), a_gpu, b_gpu, ct.Constant(TILE_SIZE)) + catch e + @test_broken false + rethrow() + end + + # Verify results + a_cpu = Array(a_gpu) + b_cpu = Array(b_gpu) + a_reshaped = reshape(a_cpu, TILE_SIZE, :) + cpu_result = cpu_reduce(a_reshaped, op) + + # Use appropriate comparison based on type + if elType <: AbstractFloat + @test b_cpu ≈ cpu_result rtol=1e-3 + else + @test b_cpu == cpu_result + end + end +end + @testset "transpose with hints" begin function transpose_with_hints(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,2}) From 5088d5411d5402c1a636f16f0e815a00389f7970 Mon Sep 17 00:00:00 2001 From: arhik Date: Sat, 17 Jan 2026 19:33:32 +0000 Subject: [PATCH 2/8] Add Number type constraint to reduce_sum and reduce_max functions - Constrain T <: Number in reduce_sum/reduce_max signatures for type safety - Ensures only numeric types can be used with reduction operations --- src/language/operations.jl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/language/operations.jl b/src/language/operations.jl index 5b00350..2dcb24f 100644 --- a/src/language/operations.jl +++ b/src/language/operations.jl @@ -529,10 +529,10 @@ Returns a 
tile with the specified dimension removed. sums = ct.reduce_sum(tile, 2) # Returns (128,) tile ``` """ -@inline function reduce_sum(tile::Tile{T, S}, axis::Integer) where {T, S} +@inline function reduce_sum(tile::Tile{T, S}, axis::Integer) where {T <: Number, S} Intrinsics.reduce_sum(tile, Val(axis - 1)) end -@inline function reduce_sum(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis} +@inline function reduce_sum(tile::Tile{T, S}, ::Val{axis}) where {T <: Number, S, axis} Intrinsics.reduce_sum(tile, Val(axis - 1)) end @@ -546,10 +546,10 @@ Maximum reduction along the specified axis (1-indexed). maxes = ct.reduce_max(tile, 2) # Max along axis 2 ``` """ -@inline function reduce_max(tile::Tile{T, S}, axis::Integer) where {T, S} +@inline function reduce_max(tile::Tile{T, S}, axis::Integer) where {T <: Number, S} Intrinsics.reduce_max(tile, Val(axis - 1)) end -@inline function reduce_max(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis} +@inline function reduce_max(tile::Tile{T, S}, ::Val{axis}) where {T <: Number, S, axis} Intrinsics.reduce_max(tile, Val(axis - 1)) end @@ -628,4 +628,3 @@ br = ct.extract(tile, (2, 2), (4, 4)) # Bottom-right (rows 5-8, cols 5-8) Intrinsics.extract(tile, Val(map(i -> i - 1, index)), Val(shape)) @inline extract(tile::Tile{T}, ::Val{Index}, ::Val{Shape}) where {T, Index, Shape} = Intrinsics.extract(tile, Val(map(i -> i - 1, Index)), Val(Shape)) - From dfd18df9123d2f4f52d9c68daf3fd92895ee2e4c Mon Sep 17 00:00:00 2001 From: Arhik Date: Tue, 20 Jan 2026 01:08:10 +0530 Subject: [PATCH 3/8] Refactor IntegerIdentityOp: remove redundant signed parameter - IntegerIdentityOp no longer stores signed::Bool field - signedness inferred from dtype (T <: Signed) - mask_to_width simplified to use T <: Signed for zigzag encoding - All call sites updated --- src/bytecode/writer.jl | 47 ++++++++++++++------------------- src/compiler/intrinsics/core.jl | 4 +-- 2 files changed, 22 insertions(+), 29 deletions(-) diff --git a/src/bytecode/writer.jl 
b/src/bytecode/writer.jl index 52b693e..5c3b092 100644 --- a/src/bytecode/writer.jl +++ b/src/bytecode/writer.jl @@ -252,15 +252,14 @@ struct FloatIdentityOp <: IdentityOp end """ - IntegerIdentityOp(value, type_id, dtype, signed) + IntegerIdentityOp(value, type_id, dtype) Integer identity value for binary operations. """ struct IntegerIdentityOp <: IdentityOp value::UInt128 # Store as UInt128 to handle all unsigned values up to 64 bits type_id::TypeId - dtype::Type # Int8, Int16, Int32, Int64, UInt8, etc. - signed::Bool # true for signed, false for unsigned + dtype::Type # Int8, Int16, Int32, Int64, UInt8, etc. (signedness inferred from dtype) end """ @@ -292,8 +291,8 @@ function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentityOp) encode_typeid!(cb.buf, identity.type_id) # Value: signed uses zigzag varint, unsigned uses plain varint # Mask value to correct bit width and apply zigzag if signed - masked_value = mask_to_width(identity.value, identity.dtype, identity.signed) - if identity.signed + masked_value = mask_to_width(identity.value, identity.dtype) + if identity.dtype <: Signed encode_signed_varint!(cb.buf, masked_value) else encode_varint!(cb.buf, masked_value) @@ -301,30 +300,24 @@ function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentityOp) end """ - mask_to_width(value, dtype, signed) + mask_to_width(value, dtype) -Mask a UInt128 value to the correct bit width for the given type and apply zigzag if signed. +Mask a UInt128 value to the correct bit width for the given type. +Applies zigzag encoding for signed types. 
""" -mask_to_width(value::UInt128, ::Type{Int64}, signed::Bool) = - let masked = UInt64(value & 0xFFFFFFFFFFFFFFFF) - UInt64((masked << 1) ⊻ (masked >>> 63)) - end -mask_to_width(value::UInt128, ::Type{Int32}, signed::Bool) = - let masked = UInt32(value & 0xFFFFFFFF) - UInt32((masked << 1) ⊻ (masked >>> 31)) - end -mask_to_width(value::UInt128, ::Type{Int16}, signed::Bool) = - let masked = UInt16(value & 0xFFFF) - UInt16((masked << 1) ⊻ (masked >>> 15)) - end -mask_to_width(value::UInt128, ::Type{Int8}, signed::Bool) = - let masked = UInt8(value & 0xFF) - UInt8((masked << 1) ⊻ (masked >>> 7)) +function mask_to_width(value::UInt128, ::Type{T}) where T <: Integer + bits = sizeof(T) * 8 + mask = (UInt128(1) << bits) - 1 + masked = value & mask + U = unsigned(T) + unsigned_masked = U(masked) + if T <: Signed + U((unsigned_masked << 1) ⊻ (unsigned_masked >>> (bits - 1))) + else + unsigned_masked end -mask_to_width(value::UInt128, ::Type{UInt64}, signed::Bool) = UInt64(value & 0xFFFFFFFFFFFFFFFF) -mask_to_width(value::UInt128, ::Type{UInt32}, signed::Bool) = UInt32(value & 0xFFFFFFFF) -mask_to_width(value::UInt128, ::Type{UInt16}, signed::Bool) = UInt16(value & 0xFFFF) -mask_to_width(value::UInt128, ::Type{UInt8}, signed::Bool) = UInt8(value & 0xFF) +end + """ float_to_bits(value, dtype) @@ -612,7 +605,7 @@ function finalize_function!(func_buf::Vector{UInt8}, cb::CodeBuilder, end #============================================================================= - Optimization Hints + Optimization Hints =============================================================================# """ diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index 61a4a0b..699da26 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -609,13 +609,13 @@ Identity must satisfy: identity ⊕ x = x for the operation. 
operation_identity(::Val{:add}, dtype, ::Type{T}) where T <: AbstractFloat = FloatIdentityOp(zero(T), dtype, T) operation_identity(::Val{:add}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentityOp(to_uint128(zero(T)), dtype, T, T <: Signed) + IntegerIdentityOp(to_uint128(zero(T)), dtype, T) # Maximum identity: max(typemin(T), x) = x operation_identity(::Val{:max}, dtype, ::Type{T}) where T <: AbstractFloat = FloatIdentityOp(typemin(T), dtype, T) operation_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentityOp(to_uint128(typemin(T)), dtype, T, T <: Signed) + IntegerIdentityOp(to_uint128(typemin(T)), dtype, T) #=============================================================================# # Reduce Body Operations - dispatch on Val{fn} and elem_type From a84cfe3e5aa0ca67c19b91f2a2e015ecc8d921ac Mon Sep 17 00:00:00 2001 From: Arhik Date: Tue, 20 Jan 2026 01:14:29 +0530 Subject: [PATCH 4/8] Simplify encode_tagged_int! by removing branching - Zigzag encoding now handled entirely in mask_to_width - Single encode_varint! 
call regardless of signedness --- src/bytecode/writer.jl | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl index 5c3b092..66e0e4f 100644 --- a/src/bytecode/writer.jl +++ b/src/bytecode/writer.jl @@ -289,14 +289,9 @@ function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentityOp) push!(cb.buf, 0x01) # Type ID encode_typeid!(cb.buf, identity.type_id) - # Value: signed uses zigzag varint, unsigned uses plain varint - # Mask value to correct bit width and apply zigzag if signed + # Mask value to correct bit width and apply zigzag encoding for signed types masked_value = mask_to_width(identity.value, identity.dtype) - if identity.dtype <: Signed - encode_signed_varint!(cb.buf, masked_value) - else - encode_varint!(cb.buf, masked_value) - end + encode_varint!(cb.buf, masked_value) end """ @@ -311,7 +306,7 @@ function mask_to_width(value::UInt128, ::Type{T}) where T <: Integer masked = value & mask U = unsigned(T) unsigned_masked = U(masked) - if T <: Signed + if T <: Signed # do zig-zag encoding U((unsigned_masked << 1) ⊻ (unsigned_masked >>> (bits - 1))) else unsigned_masked From bee18e2bb8634e89b3cf564da07e199a8f2157bc Mon Sep 17 00:00:00 2001 From: Arhik Date: Tue, 20 Jan 2026 02:04:15 +0530 Subject: [PATCH 5/8] Refactor encode_reduce_body to use if/else instead of dispatch Single-use functionality doesn't benefit from dispatch-based selection. 
--- src/bytecode/writer.jl | 2 +- src/compiler/intrinsics/core.jl | 26 +++++++++++++++++--------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl index 66e0e4f..713a11e 100644 --- a/src/bytecode/writer.jl +++ b/src/bytecode/writer.jl @@ -291,7 +291,7 @@ function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentityOp) encode_typeid!(cb.buf, identity.type_id) # Mask value to correct bit width and apply zigzag encoding for signed types masked_value = mask_to_width(identity.value, identity.dtype) - encode_varint!(cb.buf, masked_value) + encode_varint!(cb.buf, masked_value) # masked_value are already zigzag encoded end """ diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index 699da26..6049ee7 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -618,17 +618,25 @@ operation_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer = IntegerIdentityOp(to_uint128(typemin(T)), dtype, T) #=============================================================================# -# Reduce Body Operations - dispatch on Val{fn} and elem_type +# Reduce Body Operations #=============================================================================# -encode_reduce_body(cb, type, acc, elem, ::Val{:add}, ::Type{T}) where T <: AbstractFloat = - encode_AddFOp!(cb, type, acc, elem) -encode_reduce_body(cb, type, acc, elem, ::Val{:max}, ::Type{T}) where T <: AbstractFloat = - encode_MaxFOp!(cb, type, acc, elem) -encode_reduce_body(cb, type, acc, elem, ::Val{:add}, ::Type{T}) where T <: Integer = - encode_AddIOp!(cb, type, acc, elem) -encode_reduce_body(cb, type, acc, elem, ::Val{:max}, ::Type{T}) where T <: Integer = - encode_MaxIOp!(cb, type, acc, elem; signedness=T <: Signed ? 
SignednessSigned : SignednessUnsigned) +function encode_reduce_body(cb, type, acc, elem, op::Val, ::Type{T}) where T + if T <: AbstractFloat + if op == Val{:add} + encode_AddFOp!(cb, type, acc, elem) + else # Val{:max} + encode_MaxFOp!(cb, type, acc, elem) + end + else # Integer + signedness = T <: Signed ? SignednessSigned : SignednessUnsigned + if op == Val{:add} + encode_AddIOp!(cb, type, acc, elem) + else # Val{:max} + encode_MaxIOp!(cb, type, acc, elem; signedness) + end + end +end # cuda_tile.reshape From 449fe8582941916210397a4d19ba371fc8cb3171 Mon Sep 17 00:00:00 2001 From: Arhik Date: Tue, 20 Jan 2026 02:07:17 +0530 Subject: [PATCH 6/8] Use Symbol instead of Val{} in encode_reduce_body --- src/compiler/intrinsics/core.jl | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index 6049ee7..edba2b3 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -569,7 +569,7 @@ function emit_reduce!(ctx::CGCtx, args, reduce_fn::Symbol) results = encode_ReduceOp!(cb, [output_tile_type], [input_tv.v], axis, [identity], [scalar_tile_type]) do block_args acc, elem = block_args[1], block_args[2] - res = encode_reduce_body(cb, scalar_tile_type, acc, elem, Val(reduce_fn), elem_type) + res = encode_reduce_body(cb, scalar_tile_type, acc, elem, reduce_fn, elem_type) encode_YieldOp!(cb, [res]) end @@ -620,19 +620,18 @@ operation_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer = #=============================================================================# # Reduce Body Operations #=============================================================================# - -function encode_reduce_body(cb, type, acc, elem, op::Val, ::Type{T}) where T +function encode_reduce_body(cb, type, acc, elem, op::Symbol, ::Type{T}) where T if T <: AbstractFloat - if op == Val{:add} + if op == :add encode_AddFOp!(cb, type, acc, elem) - else # Val{:max} + else # :max 
encode_MaxFOp!(cb, type, acc, elem) end else # Integer signedness = T <: Signed ? SignednessSigned : SignednessUnsigned - if op == Val{:add} + if op == :add encode_AddIOp!(cb, type, acc, elem) - else # Val{:max} + else # :max encode_MaxIOp!(cb, type, acc, elem; signedness) end end From 48ea44bbc90a2f05cab538b80b7821fc1c20d8a8 Mon Sep 17 00:00:00 2001 From: Arhik Date: Tue, 20 Jan 2026 02:09:08 +0530 Subject: [PATCH 7/8] Be explicit about :max branch in encode_reduce_body --- src/compiler/intrinsics/core.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index edba2b3..f2c0f83 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -624,14 +624,14 @@ function encode_reduce_body(cb, type, acc, elem, op::Symbol, ::Type{T}) where T if T <: AbstractFloat if op == :add encode_AddFOp!(cb, type, acc, elem) - else # :max + elseif op == :max encode_MaxFOp!(cb, type, acc, elem) end else # Integer signedness = T <: Signed ? SignednessSigned : SignednessUnsigned if op == :add encode_AddIOp!(cb, type, acc, elem) - else # :max + elseif op == :max encode_MaxIOp!(cb, type, acc, elem; signedness) end end From eb6bd519f6589d3be0e9b1e3f41745be13912538 Mon Sep 17 00:00:00 2001 From: Arhik Date: Wed, 21 Jan 2026 18:41:55 +0530 Subject: [PATCH 8/8] better name for IdentityOp The original name `IdentityOp` could be a misleading representation of a TileIR Op. The new name better represents that it is rather a value.
- IdentityOp -> IdentityVal --- src/bytecode/encodings.jl | 2 +- src/bytecode/writer.jl | 26 +++++++++++++------------- src/compiler/intrinsics.jl | 2 +- src/compiler/intrinsics/core.jl | 14 +++++++------- src/cuTile.jl | 2 +- 5 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/bytecode/encodings.jl b/src/bytecode/encodings.jl index 9d20820..1e1672a 100644 --- a/src/bytecode/encodings.jl +++ b/src/bytecode/encodings.jl @@ -1291,7 +1291,7 @@ function encode_ReduceOp!(body::Function, cb::CodeBuilder, result_types::Vector{TypeId}, operands::Vector{Value}, dim::Int, - identities::Vector{<:IdentityOp}, + identities::Vector{<:IdentityVal}, body_scalar_types::Vector{TypeId}) encode_varint!(cb.buf, Opcode.ReduceOp) diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl index 713a11e..19beb8e 100644 --- a/src/bytecode/writer.jl +++ b/src/bytecode/writer.jl @@ -234,41 +234,41 @@ end =============================================================================# """ - IdentityOp + IdentityVal Abstract type for binary operation identity attributes (reduce, scan, etc.). """ -abstract type IdentityOp end +abstract type IdentityVal end """ - FloatIdentityOp(value, type_id, dtype) + FloatIdentityVal(value, type_id, dtype) Float identity value for binary operations. """ -struct FloatIdentityOp <: IdentityOp +struct FloatIdentityVal <: IdentityVal value::Float64 type_id::TypeId dtype::Type # Float16, Float32, Float64, etc. end """ - IntegerIdentityOp(value, type_id, dtype) + IntegerIdentityVal(value, type_id, dtype) Integer identity value for binary operations. """ -struct IntegerIdentityOp <: IdentityOp +struct IntegerIdentityVal <: IdentityVal value::UInt128 # Store as UInt128 to handle all unsigned values up to 64 bits type_id::TypeId dtype::Type # Int8, Int16, Int32, Int64, UInt8, etc. 
(signedness inferred from dtype) end """ - encode_tagged_float!(cb, identity::FloatIdentityOp) + encode_tagged_float!(cb, identity::FloatIdentityVal) Encode a tagged float attribute for reduce identity. Format: tag(Float=0x02) + typeid + ap_int(value_bits) """ -function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentityOp) +function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentityVal) # Tag for Float attribute push!(cb.buf, 0x02) # Type ID @@ -279,12 +279,12 @@ function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentityOp) end """ - encode_tagged_int!(cb, identity::IntegerIdentityOp) + encode_tagged_int!(cb, identity::IntegerIdentityVal) Encode a tagged integer identity attribute. Format: tag(Int=0x01) + typeid + ap_int(value) """ -function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentityOp) +function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentityVal) # Tag for Int attribute push!(cb.buf, 0x01) # Type ID @@ -354,7 +354,7 @@ end Encode an array of binary operation identity attributes. Dispatches on identity type to encode correctly. """ -function encode_identity_array!(cb::CodeBuilder, identities::Vector{<:IdentityOp}) +function encode_identity_array!(cb::CodeBuilder, identities::Vector{<:IdentityVal}) encode_varint!(cb.buf, length(identities)) for identity in identities encode_identity!(cb, identity) @@ -366,8 +366,8 @@ end Encode a single identity attribute, dispatching on type. 
""" -encode_identity!(cb::CodeBuilder, identity::FloatIdentityOp) = encode_tagged_float!(cb, identity) -encode_identity!(cb::CodeBuilder, identity::IntegerIdentityOp) = encode_tagged_int!(cb, identity) +encode_identity!(cb::CodeBuilder, identity::FloatIdentityVal) = encode_tagged_float!(cb, identity) +encode_identity!(cb::CodeBuilder, identity::IntegerIdentityVal) = encode_tagged_int!(cb, identity) """ BytecodeWriter diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl index e522141..45dc34f 100644 --- a/src/compiler/intrinsics.jl +++ b/src/compiler/intrinsics.jl @@ -8,7 +8,7 @@ using Base: compilerbarrier, donotdelete using ..cuTile: Tile, TileArray, Constant, TensorView, PartitionView using ..cuTile: Signedness, SignednessSigned, SignednessUnsigned using ..cuTile: ComparisonPredicate, CmpLessThan, CmpLessThanOrEqual, CmpGreaterThan, CmpGreaterThanOrEqual, CmpEqual, CmpNotEqual -using ..cuTile: IdentityOp, FloatIdentityOp, IntegerIdentityOp +using ..cuTile: IdentityVal, FloatIdentityVal, IntegerIdentityVal end diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index f2c0f83..91e1dcf 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -581,10 +581,10 @@ end #=============================================================================# """ - operation_identity(fn, dtype, elem_type) -> IdentityOp + operation_identity(fn, dtype, elem_type) -> IdentityVal to_uint128(value) -Convert an integer value to UInt128 for storage in IntegerIdentityOp. +Convert an integer value to UInt128 for storage in IntegerIdentityVal. For signed types, this returns the two's complement bit representation. 
""" # Unsigned types: directly convert @@ -599,7 +599,7 @@ to_uint128(value::Int16) = UInt128(reinterpret(UInt16, value)) to_uint128(value::Int8) = UInt128(reinterpret(UInt8, value)) """ - operation_identity(fn, dtype, elem_type) -> IdentityOp + operation_identity(fn, dtype, elem_type) -> IdentityVal Return the identity value for a binary operation (reduce, scan, etc.). Identity must satisfy: identity ⊕ x = x for the operation. @@ -607,15 +607,15 @@ Identity must satisfy: identity ⊕ x = x for the operation. # Addition identity: 0 + x = x operation_identity(::Val{:add}, dtype, ::Type{T}) where T <: AbstractFloat = - FloatIdentityOp(zero(T), dtype, T) + FloatIdentityVal(zero(T), dtype, T) operation_identity(::Val{:add}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentityOp(to_uint128(zero(T)), dtype, T) + IntegerIdentityVal(to_uint128(zero(T)), dtype, T) # Maximum identity: max(typemin(T), x) = x operation_identity(::Val{:max}, dtype, ::Type{T}) where T <: AbstractFloat = - FloatIdentityOp(typemin(T), dtype, T) + FloatIdentityVal(typemin(T), dtype, T) operation_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentityOp(to_uint128(typemin(T)), dtype, T) + IntegerIdentityVal(to_uint128(typemin(T)), dtype, T) #=============================================================================# # Reduce Body Operations diff --git a/src/cuTile.jl b/src/cuTile.jl index a3b019d..7cb13b4 100644 --- a/src/cuTile.jl +++ b/src/cuTile.jl @@ -41,6 +41,6 @@ public launch launch() = error("Please import CUDA.jl before using `cuTile.launch`.") # Export identity types for reduction operations -public IdentityOp, FloatIdentityOp, IntegerIdentityOp +public IdentityVal, FloatIdentityVal, IntegerIdentityVal end # module cuTile