29 commits
3bc8521
intrinsics: extend reduce operations with mul, min, and, or, xor for …
arhik Jan 11, 2026
2b21f73
operations: add axis(i) helper for 1-based to 0-based axis conversion
arhik Jan 16, 2026
3f9584f
make axis public
arhik Jan 11, 2026
43cd64b
make new reduce_{ops} public
arhik Jan 11, 2026
12ebf37
reduce ops update and axis convenience
arhik Jan 11, 2026
d1c977a
reduce ops: add wrapper functions and correct identity values via dis…
arhik Jan 11, 2026
31282c4
add IntIdentity type for integer reduce operations
arhik Jan 11, 2026
e689672
rename IntIdentity to IntegerIdentity for clarity
arhik Jan 11, 2026
aa67f22
fix: remove AbstractFloat constraint from reduce_sum and reduce_max
arhik Jan 11, 2026
eb98ec1
use Number constraint for numeric reduce operations
arhik Jan 11, 2026
e097fa7
add signed field to IntegerIdentity for proper signed/unsigned encoding
arhik Jan 11, 2026
6652087
remove unused is_reduce kwarg from encode_tagged_int
arhik Jan 11, 2026
8b62221
rename ReduceIdentity to OperationIdentity
arhik Jan 11, 2026
6ddecaa
rename identity types to IdentityOp hierarchy
arhik Jan 11, 2026
9fc8d0a
fix is_signed to use proper Julia type hierarchy check
arhik Jan 11, 2026
00f0de9
intrinsics: use -one(T) instead of -1 for signed AND identity
arhik Jan 11, 2026
55b601e
test: restore codegen and types tests, fix reduce_ops reference
arhik Jan 11, 2026
959daa4
multiline comment mess in reduce_ops.jl
arhik Jan 11, 2026
9b0418f
intrinsics: rename reduce_identity -> operation_identity
arhik Jan 11, 2026
47a30bc
test: fix CPU reference functions for bitwise ops
arhik Jan 11, 2026
baa5e65
bytecode: fix zigzag encoding for signed varint
arhik Jan 11, 2026
90fca21
reverting zigzag encoding
arhik Jan 11, 2026
2eb3171
bytecode: fix zigzag encoding for signed varint
arhik Jan 11, 2026
5dea6f3
bytecode: remove duplicate encode_signed_varint!
arhik Jan 11, 2026
6dc89b4
intrinsics: pass signedness to encode_MinIOp! and encode_MaxIOp!
arhik Jan 11, 2026
db685ae
reverting original encode_signed_varint
arhik Jan 12, 2026
6a1b21d
revert comment inside encode_signed_varint
arhik Jan 12, 2026
be4265f
Fix SLEB128 zigzag encoding for 64-bit and small integer types
arhik Jan 16, 2026
eb34ca3
Simplify reduce_ops.jl tests with broadcasting
arhik Jan 16, 2026
10 changes: 10 additions & 0 deletions README.md
@@ -160,7 +160,12 @@ conservative token threading in the compiler (see https://github.com/JuliaGPU/cu
| Operation | Description |
|-----------|-------------|
| `reduce_sum(tile, axis)` | Sum along axis |
| `reduce_mul(tile, axis)` | Product along axis |
| `reduce_max(tile, axis)` | Maximum along axis |
| `reduce_min(tile, axis)` | Minimum along axis |
| `reduce_and(tile, axis)` | Bitwise AND along axis (integer) |
| `reduce_or(tile, axis)` | Bitwise OR along axis (integer) |
| `reduce_xor(tile, axis)` | Bitwise XOR along axis (integer) |

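A minimal sketch of how these are used in a kernel, adapted from `examples/reducekernel.jl` added in this PR (the per-block load/reduce/store pattern comes from that file; the names `tile_sum_kernel`, `a`, `b`, and `sz` here are illustrative):

```julia
using CUDA
import cuTile as ct

function tile_sum_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1},
                         tile_size::ct.Constant{Int})
    t = ct.load(a, ct.bid(1), (tile_size[],))         # load one tile per block
    ct.store(b, ct.bid(1), ct.reduce_sum(t, Val(1)))  # reduce along the only axis
    return nothing
end

sz = 32
a = CUDA.rand(Float32, 1024)
b = CUDA.zeros(Float32, cld(length(a), sz))
ct.launch(tile_sum_kernel, cld(length(a), sz), a, b, ct.Constant(sz))
```
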
### Math
| Operation | Description |
@@ -275,6 +280,11 @@ ct.permute(tile, (3, 1, 2))

This applies to `bid`, `num_blocks`, `permute`, `reshape`, dimension arguments, etc.

### `axis` convenience helper

| Operation | Description |
|-----------|-------------|
| `axis(i)` | Convert a 1-based axis to 0-based |

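A minimal sketch of the intended behaviour (assuming the helper is exported as `ct.axis` and simply shifts the index down by one, per its description):

```julia
ct.axis(1)  # == 0: first Julia axis, expressed in 0-based form
ct.axis(2)  # == 1
```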

### `Val`-like constants

CuTile.jl uses `ct.Constant{T}` to encode compile-time constant values in the type domain, similar to how `Val` works. An explicit `[]` is needed to extract the value at runtime:
153 changes: 153 additions & 0 deletions examples/reducekernel.jl
@@ -0,0 +1,153 @@
using Test
using CUDA
using cuTile
import cuTile as ct

# Kernel factory to properly capture element type and operation
function makeReduceKernel(::Type{T}, op::Symbol) where {T}
reduceFunc = if op == :reduce_min
ct.reduce_min
elseif op == :reduce_max
ct.reduce_max
elseif op == :reduce_sum
ct.reduce_sum
elseif op == :reduce_xor
ct.reduce_xor
elseif op == :reduce_or
ct.reduce_or
elseif op == :reduce_and
ct.reduce_and
end

@inline function kernel(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, tileSz::ct.Constant{Int})
ct.store(b, ct.bid(1), reduceFunc(ct.load(a, ct.bid(1), (tileSz[],)), Val(1)))
return nothing
end
return kernel
end

# Test with UInt types
@testset for elType in [UInt16, UInt32, UInt64]
@testset for op in [:reduce_min, :reduce_max, :reduce_sum, :reduce_xor, :reduce_or, :reduce_and]
sz = 32
N = 2^15

# Create kernel using factory
reduceKernel = try
makeReduceKernel(elType, op)
catch e
@test_broken false
rethrow()
end

# Create data and run kernel
a_gpu = CUDA.rand(elType, N)
b_gpu = CUDA.zeros(elType, cld(N, sz))
try
CUDA.@sync ct.launch(reduceKernel, cld(length(a_gpu), sz), a_gpu, b_gpu, ct.Constant(sz))
catch e
@test_broken false
rethrow()
end
res = Array(b_gpu)

# CPU computation
a_cpu = Array(a_gpu)
a_reshaped = reshape(a_cpu, sz, :)

if op == :reduce_min
cpu_result = minimum(a_reshaped, dims=1)[:]
elseif op == :reduce_max
cpu_result = maximum(a_reshaped, dims=1)[:]
elseif op == :reduce_sum
raw_sum = sum(a_reshaped, dims=1)[:]
cpu_result = raw_sum .& typemax(elType)
elseif op == :reduce_xor
cpu_result = mapslices(x -> reduce(⊻, x), a_reshaped, dims=1)[:]
elseif op == :reduce_or
cpu_result = mapslices(x -> reduce(|, x), a_reshaped, dims=1)[:]
elseif op == :reduce_and
cpu_result = mapslices(x -> reduce(&, x), a_reshaped, dims=1)[:]
end

@test cpu_result == res
end
end

# Test with signed Int types
@testset for elType in [Int16, Int32, Int64]
@testset for op in [:reduce_min, :reduce_max, :reduce_sum, :reduce_xor, :reduce_or, :reduce_and]
sz = 32
N = 2^15

# Create kernel using factory
reduceKernel = try
makeReduceKernel(elType, op)
catch e
@test_broken false
rethrow()
end

# Create data and run kernel - use range to get negative values too
a_gpu = CuArray{elType}(rand(-1000:1000, N))
b_gpu = CUDA.zeros(elType, cld(N, sz))
try
CUDA.@sync ct.launch(reduceKernel, cld(length(a_gpu), sz), a_gpu, b_gpu, ct.Constant(sz))
catch e
@test_broken false
rethrow()
end
res = Array(b_gpu)

# CPU computation
a_cpu = Array(a_gpu)
a_reshaped = reshape(a_cpu, sz, :)

if op == :reduce_min
cpu_result = minimum(a_reshaped, dims=1)[:]
elseif op == :reduce_max
cpu_result = maximum(a_reshaped, dims=1)[:]
elseif op == :reduce_sum
cpu_result = sum(a_reshaped, dims=1)[:]
elseif op == :reduce_xor
cpu_result = mapslices(x -> reduce(⊻, x), a_reshaped, dims=1)[:]
elseif op == :reduce_or
cpu_result = mapslices(x -> reduce(|, x), a_reshaped, dims=1)[:]
elseif op == :reduce_and
cpu_result = mapslices(x -> reduce(&, x), a_reshaped, dims=1)[:]
end

@test cpu_result == res
end
end

# Test with Float types
@testset for elType in [Float16, Float32, Float64]
@testset for op in [:reduce_min, :reduce_max, :reduce_sum]
sz = 32
N = 2^15

# Create kernel using factory
reduceKernel = makeReduceKernel(elType, op)

# Create data and run kernel
a_gpu = CUDA.rand(elType, N)
b_gpu = CUDA.zeros(elType, cld(N, sz))
CUDA.@sync ct.launch(reduceKernel, cld(length(a_gpu), sz), a_gpu, b_gpu, ct.Constant(sz))
res = Array(b_gpu)

# CPU computation
a_cpu = Array(a_gpu)
a_reshaped = reshape(a_cpu, sz, :)

if op == :reduce_min
cpu_result = minimum(a_reshaped, dims=1)[:]
elseif op == :reduce_max
cpu_result = maximum(a_reshaped, dims=1)[:]
elseif op == :reduce_sum
cpu_result = sum(a_reshaped, dims=1)[:]
end

@test isapprox(cpu_result, res)
end
end
2 changes: 1 addition & 1 deletion src/bytecode/encodings.jl
@@ -1291,7 +1291,7 @@ function encode_ReduceOp!(body::Function, cb::CodeBuilder,
result_types::Vector{TypeId},
operands::Vector{Value},
dim::Int,
identities::Vector{<:ReduceIdentity},
identities::Vector{<:IdentityOp},
body_scalar_types::Vector{TypeId})
encode_varint!(cb.buf, Opcode.ReduceOp)

99 changes: 87 additions & 12 deletions src/bytecode/writer.jl
@@ -234,30 +234,42 @@ end
=============================================================================#

"""
ReduceIdentity
IdentityOp

Abstract type for reduce identity attributes.
Abstract type for binary operation identity attributes (reduce, scan, etc.).
"""
abstract type ReduceIdentity end
abstract type IdentityOp end

"""
FloatIdentity(value, type_id, dtype)
FloatIdentityOp(value, type_id, dtype)

Float identity value for reduce operations.
Float identity value for binary operations.
"""
struct FloatIdentity <: ReduceIdentity
struct FloatIdentityOp <: IdentityOp
value::Float64
type_id::TypeId
dtype::Type # Float16, Float32, Float64, etc.
end

"""
encode_tagged_float!(cb, identity::FloatIdentity)
IntegerIdentityOp(value, type_id, dtype, signed)

Integer identity value for binary operations.
"""
struct IntegerIdentityOp <: IdentityOp
value::UInt128 # Store as UInt128 to handle all unsigned values up to 64 bits
type_id::TypeId
dtype::Type # Int8, Int16, Int32, Int64, UInt8, etc.
signed::Bool # true for signed, false for unsigned
end

"""
encode_tagged_float!(cb, identity::FloatIdentityOp)

Encode a tagged float attribute for reduce identity.
Format: tag(Float=0x02) + typeid + ap_int(value_bits)
"""
function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentity)
function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentityOp)
# Tag for Float attribute
push!(cb.buf, 0x02)
# Type ID
@@ -267,6 +279,59 @@ function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentity)
encode_signed_varint!(cb.buf, bits)
end

"""
encode_tagged_int!(cb, identity::IntegerIdentityOp)

Encode a tagged integer identity attribute.
Format: tag(Int=0x01) + typeid + ap_int(value)
"""
function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentityOp)
# Tag for Int attribute
push!(cb.buf, 0x01)
# Type ID
encode_typeid!(cb.buf, identity.type_id)
# Value: signed uses zigzag varint, unsigned uses plain varint
# Mask value to correct bit width and apply zigzag for signed types
masked_value = mask_to_width(identity.value, identity.dtype, identity.signed)
if identity.signed
encode_signed_varint!(cb.buf, masked_value)
else
encode_varint!(cb.buf, masked_value)
end
end

"""
mask_to_width(value, dtype, signed)

Mask a UInt128 value to the correct bit width for the given type and apply zigzag if signed.
For signed types, this masks first, then applies zigzag encoding.
"""
# Signed Int64: mask to 64 bits first, then zigzag encode
mask_to_width(value::UInt128, ::Type{Int64}, signed::Bool) =
let masked = UInt64(value & 0xFFFFFFFFFFFFFFFF)
UInt64((masked << 1) ⊻ (masked >>> 63))
end
# Signed Int32: mask to 32 bits first, then zigzag encode
mask_to_width(value::UInt128, ::Type{Int32}, signed::Bool) =
let masked = UInt32(value & 0xFFFFFFFF)
UInt32((masked << 1) ⊻ (masked >>> 31))
end
# Signed Int16: mask to 16 bits first, then zigzag encode
mask_to_width(value::UInt128, ::Type{Int16}, signed::Bool) =
let masked = UInt16(value & 0xFFFF)
UInt16((masked << 1) ⊻ (masked >>> 15))
end
# Signed Int8: mask to 8 bits first, then zigzag encode
mask_to_width(value::UInt128, ::Type{Int8}, signed::Bool) =
let masked = UInt8(value & 0xFF)
UInt8((masked << 1) ⊻ (masked >>> 7))
end
# Unsigned types: just mask to bit width, no zigzag
mask_to_width(value::UInt128, ::Type{UInt64}, signed::Bool) = UInt64(value & 0xFFFFFFFFFFFFFFFF)
mask_to_width(value::UInt128, ::Type{UInt32}, signed::Bool) = UInt32(value & 0xFFFFFFFF)
mask_to_width(value::UInt128, ::Type{UInt16}, signed::Bool) = UInt16(value & 0xFFFF)
mask_to_width(value::UInt128, ::Type{UInt8}, signed::Bool) = UInt8(value & 0xFF)
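
# Worked illustration (a minimal sketch, assuming an Int16 identity of -1 arrives
# with 0xFFFF in the low 16 bits of the UInt128): the mask keeps 0xFFFF, and the
# shift/xor step then produces (0xFFFE ⊻ 0x0001) == 0xFFFF, which is the value
# handed to encode_signed_varint!. Unsigned types are only truncated to their bit
# width, with no shift/xor:
#
#     mask_to_width(UInt128(0xFFFF), Int16, true)  === 0xFFFF
#     mask_to_width(UInt128(0xFFFF), UInt16, false) === 0xFFFF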

"""
float_to_bits(value, dtype)

@@ -296,6 +361,7 @@ end
Encode a signed integer as a variable-length integer.
Uses zigzag encoding for signed values.
"""

function encode_signed_varint!(buf::Vector{UInt8}, value::Union{UInt16, UInt32, UInt64, Int64})
# For float bits, encode as unsigned varint
encode_varint!(buf, UInt64(value))
@@ -304,15 +370,24 @@ end
"""
encode_identity_array!(cb, identities)

Encode an array of reduce identity attributes.
Encode an array of binary operation identity attributes.
Dispatches on identity type to encode correctly.
"""
function encode_identity_array!(cb::CodeBuilder, identities::Vector{<:ReduceIdentity})
function encode_identity_array!(cb::CodeBuilder, identities::Vector{<:IdentityOp})
encode_varint!(cb.buf, length(identities))
for identity in identities
encode_tagged_float!(cb, identity)
encode_identity!(cb, identity)
end
end

"""
encode_identity!(cb, identity)

Encode a single identity attribute, dispatching on type.
"""
encode_identity!(cb::CodeBuilder, identity::FloatIdentityOp) = encode_tagged_float!(cb, identity)
encode_identity!(cb::CodeBuilder, identity::IntegerIdentityOp) = encode_tagged_int!(cb, identity)

"""
BytecodeWriter

@@ -544,7 +619,7 @@ function finalize_function!(func_buf::Vector{UInt8}, cb::CodeBuilder,
end

#=============================================================================
Optimization Hints
Optimization Hints
=============================================================================#

"""
1 change: 1 addition & 0 deletions src/compiler/intrinsics.jl
@@ -8,6 +8,7 @@ using Base: compilerbarrier, donotdelete
using ..cuTile: Tile, TileArray, Constant, TensorView, PartitionView
using ..cuTile: Signedness, SignednessSigned, SignednessUnsigned
using ..cuTile: ComparisonPredicate, CmpLessThan, CmpLessThanOrEqual, CmpGreaterThan, CmpGreaterThanOrEqual, CmpEqual, CmpNotEqual
using ..cuTile: IdentityOp, FloatIdentityOp, IntegerIdentityOp

end
