From 7caaa16384ab0145deb37869efec37def97724f9 Mon Sep 17 00:00:00 2001 From: Arhik Date: Sun, 11 Jan 2026 18:53:32 +0530 Subject: [PATCH 01/31] intrinsics: extend reduce operations with mul, min, and, or, xor for both float and integer types - Add reduce_mul, reduce_min, reduce_and, reduce_or, reduce_xor intrinsic functions - Remove AbstractFloat constraint from reduce_sum and reduce_max - Add corresponding emit_intrinsic! handlers for new operations - Add integer encode_reduce_body methods for and, or, xor operations Summary of Additions | Function | Symbol | Types | |----------|--------|-------| | `reduce_sum` | `:add` | Any | | `reduce_max` | `:max` | Any | | `reduce_mul` | `:mul` | Any | | `reduce_min` | `:min` | Any | | `reduce_and` | `:and` | Integer only | | `reduce_or` | `:or` | Integer only | | `reduce_xor` | `:xor` | Integer only | --- src/compiler/intrinsics/core.jl | 114 +++++++++++++++++++++++++++++--- 1 file changed, 104 insertions(+), 10 deletions(-) diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index c4ce4f1..a35d945 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -525,7 +525,7 @@ end Sum reduction along 0-indexed axis. Compiled to cuda_tile.reduce with ADD. """ - @noinline function reduce_sum(tile::Tile{T, S}, ::Val{axis}) where {T <: AbstractFloat, S, axis} + @noinline function reduce_sum(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis} reduced_shape = ntuple(i -> S[i < axis + 1 ? i : i + 1], length(S) - 1) Tile{T, reduced_shape}() end @@ -536,7 +536,65 @@ end Maximum reduction along 0-indexed axis. Compiled to cuda_tile.reduce with MAX. """ - @noinline function reduce_max(tile::Tile{T, S}, ::Val{axis}) where {T <: AbstractFloat, S, axis} + @noinline function reduce_max(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis} + reduced_shape = ntuple(i -> S[i < axis + 1 ? 
i : i + 1], length(S) - 1) + Tile{T, reduced_shape}() + end + + """ + reduce_mul(tile, axis_val) + + Product reduction along 0-indexed axis. + Compiled to cuda_tile.reduce with MUL. + """ + @noinline function reduce_mul(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis} + reduced_shape = ntuple(i -> S[i < axis + 1 ? i : i + 1], length(S) - 1) + Tile{T, reduced_shape}() + end + + """ + reduce_min(tile, axis_val) + + Minimum reduction along 0-indexed axis. + Compiled to cuda_tile.reduce with MIN. + """ + @noinline function reduce_min(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis} + reduced_shape = ntuple(i -> S[i < axis + 1 ? i : i + 1], length(S) - 1) + Tile{T, reduced_shape}() + end + + """ + reduce_and(tile, axis_val) + + Bitwise AND reduction along 0-indexed axis. + Compiled to cuda_tile.reduce with AND. + Integer types only. + """ + @noinline function reduce_and(tile::Tile{T, S}, ::Val{axis}) where {T <: Integer, S, axis} + reduced_shape = ntuple(i -> S[i < axis + 1 ? i : i + 1], length(S) - 1) + Tile{T, reduced_shape}() + end + + """ + reduce_or(tile, axis_val) + + Bitwise OR reduction along 0-indexed axis. + Compiled to cuda_tile.reduce with OR. + Integer types only. + """ + @noinline function reduce_or(tile::Tile{T, S}, ::Val{axis}) where {T <: Integer, S, axis} + reduced_shape = ntuple(i -> S[i < axis + 1 ? i : i + 1], length(S) - 1) + Tile{T, reduced_shape}() + end + + """ + reduce_xor(tile, axis_val) + + Bitwise XOR reduction along 0-indexed axis. + Compiled to cuda_tile.reduce with XOR. + Integer types only. + """ + @noinline function reduce_xor(tile::Tile{T, S}, ::Val{axis}) where {T <: Integer, S, axis} reduced_shape = ntuple(i -> S[i < axis + 1 ? 
i : i + 1], length(S) - 1) Tile{T, reduced_shape}() end @@ -547,6 +605,22 @@ end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reduce_max), args) emit_reduce!(ctx, args, :max) end +function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reduce_mul), args) + emit_reduce!(ctx, args, :mul) +end +function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reduce_min), args) + emit_reduce!(ctx, args, :min) +end +function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reduce_and), args) + emit_reduce!(ctx, args, :and) +end +function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reduce_or), args) + emit_reduce!(ctx, args, :or) +end +function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reduce_xor), args) + emit_reduce!(ctx, args, :xor) +end + function emit_reduce!(ctx::CGCtx, args, reduce_fn::Symbol) cb = ctx.cb tt = ctx.tt @@ -583,20 +657,40 @@ function emit_reduce!(ctx::CGCtx, args, reduce_fn::Symbol) results = encode_ReduceOp!(cb, [output_tile_type], [input_tv.v], axis, [identity], [scalar_tile_type]) do block_args acc, elem = block_args[1], block_args[2] - if reduce_fn == :add - res = encode_AddFOp!(cb, scalar_tile_type, acc, elem) - elseif reduce_fn == :max - res = encode_MaxFOp!(cb, scalar_tile_type, acc, elem) - else - error("Unsupported reduction function: $reduce_fn") - end - + res = encode_reduce_body(cb, scalar_tile_type, acc, elem, Val(reduce_fn), elem_type) encode_YieldOp!(cb, [res]) end CGVal(results[1], output_tile_type, Tile{elem_type, Tuple(output_shape)}, output_shape) end +# Dispatch helpers for reduce body operations - dispatch on Val{fn} and elem_type +encode_reduce_body(cb, type, acc, elem, ::Val{:add}, ::Type{T}) where T <: AbstractFloat = + encode_AddFOp!(cb, type, acc, elem) +encode_reduce_body(cb, type, acc, elem, ::Val{:max}, ::Type{T}) where T <: AbstractFloat = + encode_MaxFOp!(cb, type, acc, elem) +encode_reduce_body(cb, type, acc, elem, ::Val{:mul}, ::Type{T}) where T <: AbstractFloat = + encode_MulFOp!(cb, type, acc, elem) 
+encode_reduce_body(cb, type, acc, elem, ::Val{:min}, ::Type{T}) where T <: AbstractFloat = + encode_MinFOp!(cb, type, acc, elem) +encode_reduce_body(cb, type, acc, elem, ::Val{:add}, ::Type{T}) where T <: Integer = + encode_AddIOp!(cb, type, acc, elem) +encode_reduce_body(cb, type, acc, elem, ::Val{:max}, ::Type{T}) where T <: Integer = + encode_MaxIOp!(cb, type, acc, elem) +encode_reduce_body(cb, type, acc, elem, ::Val{:mul}, ::Type{T}) where T <: Integer = + encode_MulIOp!(cb, type, acc, elem) +encode_reduce_body(cb, type, acc, elem, ::Val{:min}, ::Type{T}) where T <: Integer = + encode_MinIOp!(cb, type, acc, elem) + + +# less likely commutative/associative ops can be reduced too for whatever reason. +# eg: and, or, xor. +encode_reduce_body(cb, type, acc, elem, ::Val{:and}, ::Type{T}) where T <: Integer = + encode_AndIOp!(cb, type, acc, elem) +encode_reduce_body(cb, type, acc, elem, ::Val{:or}, ::Type{T}) where T <: Integer = + encode_OrIOp!(cb, type, acc, elem) +encode_reduce_body(cb, type, acc, elem, ::Val{:xor}, ::Type{T}) where T <: Integer = + encode_XOrIOp!(cb, type, acc, elem) # cuda_tile.reshape @eval Intrinsics begin From 5d3950018f143824fa71211fd8f6e94f26c0fda3 Mon Sep 17 00:00:00 2001 From: Arhik Date: Sun, 11 Jan 2026 19:00:04 +0530 Subject: [PATCH 02/31] operations: add axis(i) helper for 1-based to 0-based axis conversion - Add axis(i::Integer) -> Val{i-1} convenience function - Use instead of raw Val for self-documenting axis selection --- src/language/operations.jl | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/language/operations.jl b/src/language/operations.jl index bf20bb2..bd67afc 100644 --- a/src/language/operations.jl +++ b/src/language/operations.jl @@ -59,6 +59,24 @@ Axis is 1-indexed. Equivalent to cld(arr.sizes[axis], shape[axis]). Intrinsics.get_index_space_shape(pv, axis - One()) # convert to 0-indexed end +""" + axis(i::Integer) -> Val{i-1} + +Return a compile-time axis selector for tile operations. 
+Axis indices are 1-based (axis(1) = first dimension, axis(2) = second, etc.). +Internally converts to 0-based for Tile IR. + +Use this instead of raw `Val` for self-documenting code. + +# Examples +```julia +ct.cumsum(tile, ct.axis(1)) # Scan along first axis +ct.cumsum(tile, ct.axis(2)) # Scan along second axis +ct.scan(tile, ct.axis(1), :add) +``` +""" +@inline axis(i::Integer) = Val(i - One()) + """ load(arr::TileArray, index, shape; padding_mode=PaddingMode.Undetermined) -> Tile From 554d2b0d12cc936ef0c3bfaa3285214b08a533e7 Mon Sep 17 00:00:00 2001 From: Arhik Date: Sun, 11 Jan 2026 19:03:44 +0530 Subject: [PATCH 03/31] make axis public --- src/language/operations.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/language/operations.jl b/src/language/operations.jl index bd67afc..69ef444 100644 --- a/src/language/operations.jl +++ b/src/language/operations.jl @@ -10,7 +10,7 @@ Load/Store =============================================================================# -public bid, num_blocks, num_tiles, load, store, gather, scatter +public bid, num_blocks, num_tiles, axis, load, store, gather, scatter """ Padding mode for load operations. 
From a10c08604434cd98c5c21cba906f5bf6829fa7ab Mon Sep 17 00:00:00 2001 From: Arhik Date: Sun, 11 Jan 2026 19:18:26 +0530 Subject: [PATCH 04/31] make new reduce_{ops} public --- src/language/operations.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/language/operations.jl b/src/language/operations.jl index 69ef444..19858f7 100644 --- a/src/language/operations.jl +++ b/src/language/operations.jl @@ -491,7 +491,7 @@ result = ct.astype(acc, ct.TFloat32) # Convert to TF32 for tensor cores Reduction =============================================================================# -public reduce_sum, reduce_max +public reduce_sum, reduce_max, reduce_mul, reduce_min, reduce_and, reduce_or, reduce_xor """ reduce_sum(tile::Tile{T, S}, axis::Integer) -> Tile{T, reduced_shape} From abd829961044b448168ed3f6b04dd8429928b606 Mon Sep 17 00:00:00 2001 From: Arhik Date: Sun, 11 Jan 2026 19:24:27 +0530 Subject: [PATCH 05/31] reduce ops update and axis convenience axis convenience is a bit helper function for `Val`. But I see reduce is already one-based. Not sure if we should go with it. It doesn't harm anything. its just a convenience. 
--- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 1b381ba..45336a7 100644 --- a/README.md +++ b/README.md @@ -159,7 +159,12 @@ conservative token threading in the compiler (see https://github.com/JuliaGPU/cu | Operation | Description | |-----------|-------------| | `reduce_sum(tile, axis)` | Sum along axis | +| `reduce_mul(tile, axis)` | Product along axis | | `reduce_max(tile, axis)` | Maximum along axis | +| `reduce_min(tile, axis)` | Minimum along axis | +| `reduce_and(tile, axis)` | Bitwise AND along axis (integer) | +| `reduce_or(tile, axis)` | Bitwise OR along axis (integer) | +| `reduce_xor(tile, axis)` | Bitwise XOR along axis (integer) | ### Math | Operation | Description | @@ -274,6 +279,11 @@ ct.permute(tile, (3, 1, 2)) This applies to `bid`, `num_blocks`, `permute`, `reshape`, dimension arguments, etc. +### axis convenience + +| `axis(i)` | Convert 1-based axis to 0-based (helper) | + + ### `Val`-like constants CuTile.jl uses `ct.Constant{T}` to encode compile-time constant values in the type domain, similar to how `Val` works. 
An explicit `[]` is needed to extract the value at runtime: From fc4833853a8ebe90be27e23bd98be1d88579d802 Mon Sep 17 00:00:00 2001 From: Arhik Date: Sun, 11 Jan 2026 19:58:00 +0530 Subject: [PATCH 06/31] reduce ops: add wrapper functions and correct identity values via dispatch - Add wrapper functions in operations.jl for reduce_mul, reduce_min, reduce_and, reduce_or, reduce_xor with appropriate type constraints - Refactor identity value selection to use dispatch instead of if-else chain - Correct identity values: - add: 0.0 - max: -Inf (float) or 0 (int) - mul: 1.0 - min: +Inf (float) or typemax(Int64) (int) - and: 0 (interpreted as -1 bits by backend) - or: 0 - xor: 0 --- src/compiler/intrinsics/core.jl | 57 ++++++++++++++++++-- src/language/operations.jl | 93 +++++++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+), 5 deletions(-) diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index a35d945..7edb2d9 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -645,13 +645,11 @@ function emit_reduce!(ctx::CGCtx, args, reduce_fn::Symbol) # Output tile type output_tile_type = tile_type!(tt, dtype, output_shape) - # Scalar type for reduction body (0D tile) scalar_tile_type = tile_type!(tt, dtype, Int[]) - # Create identity value - use simple dtype (f32), not tile type - identity_val = reduce_fn == :add ? -0.0 : (reduce_fn == :max ? 
-Inf : 0.0) - identity = FloatIdentity(identity_val, dtype, elem_type) + # Create identity value via dispatch on reduction function and element type + identity = reduce_identity(Val(reduce_fn), dtype, elem_type) # Emit ReduceOp results = encode_ReduceOp!(cb, [output_tile_type], [input_tv.v], axis, [identity], [scalar_tile_type]) do block_args @@ -664,7 +662,56 @@ function emit_reduce!(ctx::CGCtx, args, reduce_fn::Symbol) CGVal(results[1], output_tile_type, Tile{elem_type, Tuple(output_shape)}, output_shape) end -# Dispatch helpers for reduce body operations - dispatch on Val{fn} and elem_type +#============================================================================= + Reduce Identity Values via Dispatch +=============================================================================# + +""" + reduce_identity(reduce_fn, dtype, elem_type) -> ReduceIdentity + +Return the identity value for a reduction operation. +Identity must satisfy: identity ⊕ x = x for the reduction operation. +""" +# Addition identity: 0 + x = x +reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: AbstractFloat = + FloatIdentity(0.0, dtype, T) +reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: Integer = + FloatIdentity(0.0, dtype, T) + +# Maximum identity: max(-Inf, x) = x +reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: AbstractFloat = + FloatIdentity(-Inf, dtype, T) +reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer = + FloatIdentity(0.0, dtype, T) # For integers, use 0 as identity (max(0, x) = x) + +# Multiplication identity: 1 * x = x +reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: AbstractFloat = + FloatIdentity(1.0, dtype, T) +reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: Integer = + FloatIdentity(1.0, dtype, T) + +# Minimum identity: min(+Inf, x) = x +reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: AbstractFloat = + FloatIdentity(+Inf, dtype, T) +reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: Integer = + 
FloatIdentity(typemax(Int64), dtype, T) # Use max int as +Inf proxy + +# AND identity: all bits set (x & -1 == x) +reduce_identity(::Val{:and}, dtype, ::Type{T}) where T <: Integer = + FloatIdentity(0.0, dtype, T) # Will be interpreted as -1 bits by backend + +# OR identity: 0 | x = x +reduce_identity(::Val{:or}, dtype, ::Type{T}) where T <: Integer = + FloatIdentity(0.0, dtype, T) + +# XOR identity: 0 ⊕ x = x +reduce_identity(::Val{:xor}, dtype, ::Type{T}) where T <: Integer = + FloatIdentity(0.0, dtype, T) + +#============================================================================= + Reduce Body Operations - dispatch on Val{fn} and elem_type +=============================================================================# + encode_reduce_body(cb, type, acc, elem, ::Val{:add}, ::Type{T}) where T <: AbstractFloat = encode_AddFOp!(cb, type, acc, elem) encode_reduce_body(cb, type, acc, elem, ::Val{:max}, ::Type{T}) where T <: AbstractFloat = diff --git a/src/language/operations.jl b/src/language/operations.jl index 19858f7..db1356d 100644 --- a/src/language/operations.jl +++ b/src/language/operations.jl @@ -529,6 +529,99 @@ end Intrinsics.reduce_max(tile, Val(axis - 1)) end +""" + reduce_mul(tile::Tile{T, S}, axis::Integer) -> Tile{T, reduced_shape} + +Product reduction along the specified axis (1-indexed). +Returns a tile with the specified dimension removed. + +# Example +```julia +# For a (128, 64) tile, reducing along axis 2: +products = ct.reduce_mul(tile, 2) # Returns (128,) tile +``` +""" +@inline function reduce_mul(tile::Tile{T, S}, axis::Integer) where {T, S} + Intrinsics.reduce_mul(tile, Val(axis - 1)) +end +@inline function reduce_mul(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis} + Intrinsics.reduce_mul(tile, Val(axis - 1)) +end + +""" + reduce_min(tile::Tile{T, S}, axis::Integer) -> Tile{T, reduced_shape} + +Minimum reduction along the specified axis (1-indexed). 
+ +# Example +```julia +mins = ct.reduce_min(tile, 2) # Min along axis 2 +``` +""" +@inline function reduce_min(tile::Tile{T, S}, axis::Integer) where {T, S} + Intrinsics.reduce_min(tile, Val(axis - 1)) +end +@inline function reduce_min(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis} + Intrinsics.reduce_min(tile, Val(axis - 1)) +end + +""" + reduce_and(tile::Tile{T, S}, axis::Integer) -> Tile{T, reduced_shape} + +Bitwise AND reduction along the specified axis (1-indexed). +Integer types only. + +# Example +```julia +# For an Int32 tile, reducing along axis 2: +result = ct.reduce_and(tile, 2) # Returns (128,) tile of Int32 +``` +""" +@inline function reduce_and(tile::Tile{T, S}, axis::Integer) where {T <: Integer, S} + Intrinsics.reduce_and(tile, Val(axis - 1)) +end +@inline function reduce_and(tile::Tile{T, S}, ::Val{axis}) where {T <: Integer, S, axis} + Intrinsics.reduce_and(tile, Val(axis - 1)) +end + +""" + reduce_or(tile::Tile{T, S}, axis::Integer) -> Tile{T, reduced_shape} + +Bitwise OR reduction along the specified axis (1-indexed). +Integer types only. + +# Example +```julia +# For an Int32 tile, reducing along axis 2: +result = ct.reduce_or(tile, 2) # Returns (128,) tile of Int32 +``` +""" +@inline function reduce_or(tile::Tile{T, S}, axis::Integer) where {T <: Integer, S} + Intrinsics.reduce_or(tile, Val(axis - 1)) +end +@inline function reduce_or(tile::Tile{T, S}, ::Val{axis}) where {T <: Integer, S, axis} + Intrinsics.reduce_or(tile, Val(axis - 1)) +end + +""" + reduce_xor(tile::Tile{T, S}, axis::Integer) -> Tile{T, reduced_shape} + +Bitwise XOR reduction along the specified axis (1-indexed). +Integer types only. 
+ +# Example +```julia +# For an Int32 tile, reducing along axis 2: +result = ct.reduce_xor(tile, 2) # Returns (128,) tile of Int32 +``` +""" +@inline function reduce_xor(tile::Tile{T, S}, axis::Integer) where {T <: Integer, S} + Intrinsics.reduce_xor(tile, Val(axis - 1)) +end +@inline function reduce_xor(tile::Tile{T, S}, ::Val{axis}) where {T <: Integer, S, axis} + Intrinsics.reduce_xor(tile, Val(axis - 1)) +end + #============================================================================= Matmul =============================================================================# From dd251584d7f7eeb1addd5bd49d08997c3780c906 Mon Sep 17 00:00:00 2001 From: Arhik Date: Sun, 11 Jan 2026 20:04:57 +0530 Subject: [PATCH 07/31] add IntIdentity type for integer reduce operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add IntIdentity struct to bytecode/writer.jl for proper integer identity encoding - Add encode_tagged_int! function for encoding integer identity attributes (tag 0x01) - Dispatch encode_identity! on identity type for proper encoding - Update reduce_identity to return IntIdentity for integer operations - Import ReduceIdentity, FloatIdentity, IntIdentity in intrinsics.jl Identity values now properly typed: - Float operations → FloatIdentity - Integer operations → IntIdentity --- src/bytecode/writer.jl | 37 ++++++++++++++++++++++++++++++++- src/compiler/intrinsics.jl | 1 + src/compiler/intrinsics/core.jl | 14 ++++++------- 3 files changed, 44 insertions(+), 8 deletions(-) diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl index c7a2ac3..b21f431 100644 --- a/src/bytecode/writer.jl +++ b/src/bytecode/writer.jl @@ -251,6 +251,17 @@ struct FloatIdentity <: ReduceIdentity dtype::Type # Float16, Float32, Float64, etc. end +""" + IntIdentity(value, type_id, dtype) + +Integer identity value for reduce operations (and, or, xor). 
+""" +struct IntIdentity <: ReduceIdentity + value::Int64 # Store as signed Int64, will be reinterpreted as unsigned + type_id::TypeId + dtype::Type # Int8, Int16, Int32, Int64, UInt8, etc. +end + """ encode_tagged_float!(cb, identity::FloatIdentity) @@ -267,6 +278,21 @@ function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentity) encode_signed_varint!(cb.buf, bits) end +""" + encode_tagged_int!(cb, identity::IntIdentity) + +Encode a tagged integer attribute for reduce identity. +Format: tag(Int=0x01) + typeid + ap_int(value) +""" +function encode_tagged_int!(cb::CodeBuilder, identity::IntIdentity) + # Tag for Int attribute + push!(cb.buf, 0x01) + # Type ID + encode_typeid!(cb.buf, identity.type_id) + # Value as signed varint + encode_signed_varint!(cb.buf, identity.value) +end + """ float_to_bits(value, dtype) @@ -305,14 +331,23 @@ end encode_identity_array!(cb, identities) Encode an array of reduce identity attributes. +Dispatches on identity type to encode correctly. """ function encode_identity_array!(cb::CodeBuilder, identities::Vector{<:ReduceIdentity}) encode_varint!(cb.buf, length(identities)) for identity in identities - encode_tagged_float!(cb, identity) + encode_identity!(cb, identity) end end +""" + encode_identity!(cb, identity) + +Encode a single identity attribute, dispatching on type. 
+""" +encode_identity!(cb::CodeBuilder, identity::FloatIdentity) = encode_tagged_float!(cb, identity) +encode_identity!(cb::CodeBuilder, identity::IntIdentity) = encode_tagged_int!(cb, identity) + """ BytecodeWriter diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl index 06a8f40..68fc795 100644 --- a/src/compiler/intrinsics.jl +++ b/src/compiler/intrinsics.jl @@ -8,6 +8,7 @@ using Base: compilerbarrier, donotdelete using ..cuTile: Tile, TileArray, Constant, TensorView, PartitionView using ..cuTile: Signedness, SignednessSigned, SignednessUnsigned using ..cuTile: ComparisonPredicate, CmpLessThan, CmpLessThanOrEqual, CmpGreaterThan, CmpGreaterThanOrEqual, CmpEqual, CmpNotEqual +using ..cuTile: ReduceIdentity, FloatIdentity, IntIdentity end diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index 7edb2d9..9ee5f26 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -676,37 +676,37 @@ Identity must satisfy: identity ⊕ x = x for the reduction operation. 
reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: AbstractFloat = FloatIdentity(0.0, dtype, T) reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: Integer = - FloatIdentity(0.0, dtype, T) + IntIdentity(0, dtype, T) # Maximum identity: max(-Inf, x) = x reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: AbstractFloat = FloatIdentity(-Inf, dtype, T) reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer = - FloatIdentity(0.0, dtype, T) # For integers, use 0 as identity (max(0, x) = x) + IntIdentity(0, dtype, T) # For integers, use 0 as identity (max(0, x) = x) # Multiplication identity: 1 * x = x reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: AbstractFloat = FloatIdentity(1.0, dtype, T) reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: Integer = - FloatIdentity(1.0, dtype, T) + IntIdentity(1, dtype, T) # Minimum identity: min(+Inf, x) = x reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: AbstractFloat = FloatIdentity(+Inf, dtype, T) reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: Integer = - FloatIdentity(typemax(Int64), dtype, T) # Use max int as +Inf proxy + IntIdentity(typemax(Int64), dtype, T) # Use max int as +Inf proxy # AND identity: all bits set (x & -1 == x) reduce_identity(::Val{:and}, dtype, ::Type{T}) where T <: Integer = - FloatIdentity(0.0, dtype, T) # Will be interpreted as -1 bits by backend + IntIdentity(-1, dtype, T) # All bits set # OR identity: 0 | x = x reduce_identity(::Val{:or}, dtype, ::Type{T}) where T <: Integer = - FloatIdentity(0.0, dtype, T) + IntIdentity(0, dtype, T) # XOR identity: 0 ⊕ x = x reduce_identity(::Val{:xor}, dtype, ::Type{T}) where T <: Integer = - FloatIdentity(0.0, dtype, T) + IntIdentity(0, dtype, T) #============================================================================= Reduce Body Operations - dispatch on Val{fn} and elem_type From 4401d37356e85dfcf8ec351f68a99dae1197ddff Mon Sep 17 00:00:00 2001 From: Arhik Date: Sun, 11 Jan 2026 20:08:02 +0530 
Subject: [PATCH 08/31] rename IntIdentity to IntegerIdentity for clarity --- src/bytecode/writer.jl | 10 +++++----- src/compiler/intrinsics.jl | 2 +- src/compiler/intrinsics/core.jl | 14 +++++++------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl index b21f431..8a4c4e9 100644 --- a/src/bytecode/writer.jl +++ b/src/bytecode/writer.jl @@ -252,11 +252,11 @@ struct FloatIdentity <: ReduceIdentity end """ - IntIdentity(value, type_id, dtype) + IntegerIdentity(value, type_id, dtype) Integer identity value for reduce operations (and, or, xor). """ -struct IntIdentity <: ReduceIdentity +struct IntegerIdentity <: ReduceIdentity value::Int64 # Store as signed Int64, will be reinterpreted as unsigned type_id::TypeId dtype::Type # Int8, Int16, Int32, Int64, UInt8, etc. @@ -279,12 +279,12 @@ function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentity) end """ - encode_tagged_int!(cb, identity::IntIdentity) + encode_tagged_int!(cb, identity::IntegerIdentity) Encode a tagged integer attribute for reduce identity. Format: tag(Int=0x01) + typeid + ap_int(value) """ -function encode_tagged_int!(cb::CodeBuilder, identity::IntIdentity) +function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentity) # Tag for Int attribute push!(cb.buf, 0x01) # Type ID @@ -346,7 +346,7 @@ end Encode a single identity attribute, dispatching on type. 
""" encode_identity!(cb::CodeBuilder, identity::FloatIdentity) = encode_tagged_float!(cb, identity) -encode_identity!(cb::CodeBuilder, identity::IntIdentity) = encode_tagged_int!(cb, identity) +encode_identity!(cb::CodeBuilder, identity::IntegerIdentity) = encode_tagged_int!(cb, identity) """ BytecodeWriter diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl index 68fc795..b89c1c5 100644 --- a/src/compiler/intrinsics.jl +++ b/src/compiler/intrinsics.jl @@ -8,7 +8,7 @@ using Base: compilerbarrier, donotdelete using ..cuTile: Tile, TileArray, Constant, TensorView, PartitionView using ..cuTile: Signedness, SignednessSigned, SignednessUnsigned using ..cuTile: ComparisonPredicate, CmpLessThan, CmpLessThanOrEqual, CmpGreaterThan, CmpGreaterThanOrEqual, CmpEqual, CmpNotEqual -using ..cuTile: ReduceIdentity, FloatIdentity, IntIdentity +using ..cuTile: ReduceIdentity, FloatIdentity, IntegerIdentity end diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index 9ee5f26..c78a2bc 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -676,37 +676,37 @@ Identity must satisfy: identity ⊕ x = x for the reduction operation. 
reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: AbstractFloat = FloatIdentity(0.0, dtype, T) reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: Integer = - IntIdentity(0, dtype, T) + IntegerIdentity(0, dtype, T) # Maximum identity: max(-Inf, x) = x reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: AbstractFloat = FloatIdentity(-Inf, dtype, T) reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer = - IntIdentity(0, dtype, T) # For integers, use 0 as identity (max(0, x) = x) + IntegerIdentity(0, dtype, T) # For integers, use 0 as identity (max(0, x) = x) # Multiplication identity: 1 * x = x reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: AbstractFloat = FloatIdentity(1.0, dtype, T) reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: Integer = - IntIdentity(1, dtype, T) + IntegerIdentity(1, dtype, T) # Minimum identity: min(+Inf, x) = x reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: AbstractFloat = FloatIdentity(+Inf, dtype, T) reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: Integer = - IntIdentity(typemax(Int64), dtype, T) # Use max int as +Inf proxy + IntegerIdentity(typemax(Int64), dtype, T) # Use max int as +Inf proxy # AND identity: all bits set (x & -1 == x) reduce_identity(::Val{:and}, dtype, ::Type{T}) where T <: Integer = - IntIdentity(-1, dtype, T) # All bits set + IntegerIdentity(-1, dtype, T) # All bits set # OR identity: 0 | x = x reduce_identity(::Val{:or}, dtype, ::Type{T}) where T <: Integer = - IntIdentity(0, dtype, T) + IntegerIdentity(0, dtype, T) # XOR identity: 0 ⊕ x = x reduce_identity(::Val{:xor}, dtype, ::Type{T}) where T <: Integer = - IntIdentity(0, dtype, T) + IntegerIdentity(0, dtype, T) #============================================================================= Reduce Body Operations - dispatch on Val{fn} and elem_type From a70f7b7c42c7cc7e2c22ba2313d23b0200de5865 Mon Sep 17 00:00:00 2001 From: Arhik Date: Sun, 11 Jan 2026 23:47:57 +0530 Subject: [PATCH 09/31] 
fix: remove AbstractFloat constraint from reduce_sum and reduce_max The intrinsics were updated to support all types but the wrapper functions in operations.jl still had T <: AbstractFloat constraint, causing method lookup failures for integer types. --- src/language/operations.jl | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/language/operations.jl b/src/language/operations.jl index db1356d..3dcafb3 100644 --- a/src/language/operations.jl +++ b/src/language/operations.jl @@ -499,16 +499,18 @@ public reduce_sum, reduce_max, reduce_mul, reduce_min, reduce_and, reduce_or, re Sum reduction along the specified axis (1-indexed). Returns a tile with the specified dimension removed. +Supports any numeric type (Float16, Float32, Float64, and integer types). + # Example ```julia # For a (128, 64) tile, reducing along axis 2: sums = ct.reduce_sum(tile, 2) # Returns (128,) tile ``` """ -@inline function reduce_sum(tile::Tile{T, S}, axis::Integer) where {T <: AbstractFloat, S} +@inline function reduce_sum(tile::Tile{T, S}, axis::Integer) where {T, S} Intrinsics.reduce_sum(tile, Val(axis - 1)) end -@inline function reduce_sum(tile::Tile{T, S}, ::Val{axis}) where {T <: AbstractFloat, S, axis} +@inline function reduce_sum(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis} Intrinsics.reduce_sum(tile, Val(axis - 1)) end @@ -516,16 +518,17 @@ end reduce_max(tile::Tile{T, S}, axis::Integer) -> Tile{T, reduced_shape} Maximum reduction along the specified axis (1-indexed). +Supports any numeric type (Float16, Float32, Float64, and integer types). 
# Example ```julia maxes = ct.reduce_max(tile, 2) # Max along axis 2 ``` """ -@inline function reduce_max(tile::Tile{T, S}, axis::Integer) where {T <: AbstractFloat, S} +@inline function reduce_max(tile::Tile{T, S}, axis::Integer) where {T, S} Intrinsics.reduce_max(tile, Val(axis - 1)) end -@inline function reduce_max(tile::Tile{T, S}, ::Val{axis}) where {T <: AbstractFloat, S, axis} +@inline function reduce_max(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis} Intrinsics.reduce_max(tile, Val(axis - 1)) end From bbda0f14cb128ee705c4d1da842d1cc46fb4195a Mon Sep 17 00:00:00 2001 From: Arhik Date: Sun, 11 Jan 2026 23:49:20 +0530 Subject: [PATCH 10/31] use Number constraint for numeric reduce operations - reduce_sum, reduce_max, reduce_mul, reduce_min now use T <: Number - Provides type safety while supporting all numeric types - More self-documenting than unconstrained T --- src/language/operations.jl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/language/operations.jl b/src/language/operations.jl index 3dcafb3..075ddb8 100644 --- a/src/language/operations.jl +++ b/src/language/operations.jl @@ -507,10 +507,10 @@ Supports any numeric type (Float16, Float32, Float64, and integer types). sums = ct.reduce_sum(tile, 2) # Returns (128,) tile ``` """ -@inline function reduce_sum(tile::Tile{T, S}, axis::Integer) where {T, S} +@inline function reduce_sum(tile::Tile{T, S}, axis::Integer) where {T <: Number, S} Intrinsics.reduce_sum(tile, Val(axis - 1)) end -@inline function reduce_sum(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis} +@inline function reduce_sum(tile::Tile{T, S}, ::Val{axis}) where {T <: Number, S, axis} Intrinsics.reduce_sum(tile, Val(axis - 1)) end @@ -525,10 +525,10 @@ Supports any numeric type (Float16, Float32, Float64, and integer types). 
maxes = ct.reduce_max(tile, 2) # Max along axis 2 ``` """ -@inline function reduce_max(tile::Tile{T, S}, axis::Integer) where {T, S} +@inline function reduce_max(tile::Tile{T, S}, axis::Integer) where {T <: Number, S} Intrinsics.reduce_max(tile, Val(axis - 1)) end -@inline function reduce_max(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis} +@inline function reduce_max(tile::Tile{T, S}, ::Val{axis}) where {T <: Number, S, axis} Intrinsics.reduce_max(tile, Val(axis - 1)) end @@ -544,10 +544,10 @@ Returns a tile with the specified dimension removed. products = ct.reduce_mul(tile, 2) # Returns (128,) tile ``` """ -@inline function reduce_mul(tile::Tile{T, S}, axis::Integer) where {T, S} +@inline function reduce_mul(tile::Tile{T, S}, axis::Integer) where {T <: Number, S} Intrinsics.reduce_mul(tile, Val(axis - 1)) end -@inline function reduce_mul(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis} +@inline function reduce_mul(tile::Tile{T, S}, ::Val{axis}) where {T <: Number, S, axis} Intrinsics.reduce_mul(tile, Val(axis - 1)) end @@ -561,10 +561,10 @@ Minimum reduction along the specified axis (1-indexed). mins = ct.reduce_min(tile, 2) # Min along axis 2 ``` """ -@inline function reduce_min(tile::Tile{T, S}, axis::Integer) where {T, S} +@inline function reduce_min(tile::Tile{T, S}, axis::Integer) where {T <: Number, S} Intrinsics.reduce_min(tile, Val(axis - 1)) end -@inline function reduce_min(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis} +@inline function reduce_min(tile::Tile{T, S}, ::Val{axis}) where {T <: Number, S, axis} Intrinsics.reduce_min(tile, Val(axis - 1)) end From 33d93ae869cdd4614ce72fa947224cf55007cc5b Mon Sep 17 00:00:00 2001 From: Arhik Date: Sun, 11 Jan 2026 23:57:34 +0530 Subject: [PATCH 11/31] add signed field to IntegerIdentity for proper signed/unsigned encoding - IntegerIdentity now has signed::Bool field - encode_tagged_int! 
encodes signed with zigzag varint, unsigned with plain varint - Add is_signed() helper that checks T <: SignedInteger - Update all reduce_identity calls to pass is_signed(T) --- src/bytecode/writer.jl | 19 ++++++++++++------- src/compiler/intrinsics/core.jl | 22 +++++++++++++++------- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl index 8a4c4e9..9f015ec 100644 --- a/src/bytecode/writer.jl +++ b/src/bytecode/writer.jl @@ -252,14 +252,15 @@ struct FloatIdentity <: ReduceIdentity end """ - IntegerIdentity(value, type_id, dtype) + IntegerIdentity(value, type_id, dtype, signed) -Integer identity value for reduce operations (and, or, xor). +Integer identity value for reduce operations (add, max, mul, min, and, or, xor). """ struct IntegerIdentity <: ReduceIdentity value::Int64 # Store as signed Int64, will be reinterpreted as unsigned type_id::TypeId - dtype::Type # Int8, Int16, Int32, Int64, UInt8, etc. + dtype::Type # Int8, Int16, Int32, Int64, UInt8, etc. + signed::Bool # true for signed, false for unsigned end """ @@ -279,18 +280,22 @@ function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentity) end """ - encode_tagged_int!(cb, identity::IntegerIdentity) + encode_tagged_int!(cb, identity::IntegerIdentity; is_reduce::Bool=true) Encode a tagged integer attribute for reduce identity. 
Format: tag(Int=0x01) + typeid + ap_int(value) """ -function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentity) +function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentity; is_reduce::Bool=true) # Tag for Int attribute push!(cb.buf, 0x01) # Type ID encode_typeid!(cb.buf, identity.type_id) - # Value as signed varint - encode_signed_varint!(cb.buf, identity.value) + # Value: signed uses zigzag varint, unsigned uses plain varint + if identity.signed + encode_signed_varint!(cb.buf, identity.value) + else + encode_varint!(cb.buf, UInt64(identity.value)) + end end """ diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index c78a2bc..2793e0c 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -666,6 +666,14 @@ end Reduce Identity Values via Dispatch =============================================================================# +""" + is_signed(::Type{T}) -> Bool + +Return true if type T is signed, false for unsigned types. +""" +is_signed(::Type{T}) where T <: Integer = T <: SignedInteger +is_signed(::Type{T}) where T <: AbstractFloat = false + """ reduce_identity(reduce_fn, dtype, elem_type) -> ReduceIdentity @@ -676,37 +684,37 @@ Identity must satisfy: identity ⊕ x = x for the reduction operation. 
reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: AbstractFloat = FloatIdentity(0.0, dtype, T) reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentity(0, dtype, T) + IntegerIdentity(0, dtype, T, is_signed(T)) # Maximum identity: max(-Inf, x) = x reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: AbstractFloat = FloatIdentity(-Inf, dtype, T) reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentity(0, dtype, T) # For integers, use 0 as identity (max(0, x) = x) + IntegerIdentity(0, dtype, T, is_signed(T)) # For integers, use 0 as identity (max(0, x) = x) # Multiplication identity: 1 * x = x reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: AbstractFloat = FloatIdentity(1.0, dtype, T) reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentity(1, dtype, T) + IntegerIdentity(1, dtype, T, is_signed(T)) # Minimum identity: min(+Inf, x) = x reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: AbstractFloat = FloatIdentity(+Inf, dtype, T) reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentity(typemax(Int64), dtype, T) # Use max int as +Inf proxy + IntegerIdentity(typemax(Int64), dtype, T, is_signed(T)) # Use max int as +Inf proxy # AND identity: all bits set (x & -1 == x) reduce_identity(::Val{:and}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentity(-1, dtype, T) # All bits set + IntegerIdentity(-1, dtype, T, is_signed(T)) # All bits set # OR identity: 0 | x = x reduce_identity(::Val{:or}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentity(0, dtype, T) + IntegerIdentity(0, dtype, T, is_signed(T)) # XOR identity: 0 ⊕ x = x reduce_identity(::Val{:xor}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentity(0, dtype, T) + IntegerIdentity(0, dtype, T, is_signed(T)) #============================================================================= Reduce Body Operations - dispatch on Val{fn} and elem_type From 
175fc9a93f000293aad4b485cfe61bfe2f4e9c3b Mon Sep 17 00:00:00 2001 From: Arhik Date: Mon, 12 Jan 2026 00:02:23 +0530 Subject: [PATCH 12/31] remove unused is_reduce kwarg from encode_tagged_int --- src/bytecode/writer.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl index 9f015ec..a53f196 100644 --- a/src/bytecode/writer.jl +++ b/src/bytecode/writer.jl @@ -280,12 +280,12 @@ function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentity) end """ - encode_tagged_int!(cb, identity::IntegerIdentity; is_reduce::Bool=true) + encode_tagged_int!(cb, identity::IntegerIdentity) -Encode a tagged integer attribute for reduce identity. +Encode a tagged integer identity attribute. Format: tag(Int=0x01) + typeid + ap_int(value) """ -function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentity; is_reduce::Bool=true) +function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentity) # Tag for Int attribute push!(cb.buf, 0x01) # Type ID From b60a3271c1884cd6e7f1fb3324aeaaa5869ffb62 Mon Sep 17 00:00:00 2001 From: Arhik Date: Mon, 12 Jan 2026 00:20:52 +0530 Subject: [PATCH 13/31] rename ReduceIdentity to OperationIdentity - Abstract type now called OperationIdentity to reflect use by both reduce and scan operations - FloatIdentity and IntegerIdentity now inherit from OperationIdentity - Updated comments and docs to reflect the broader scope - Updated import in intrinsics.jl --- src/bytecode/writer.jl | 18 +++++++++--------- src/compiler/intrinsics.jl | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl index a53f196..0ef181c 100644 --- a/src/bytecode/writer.jl +++ b/src/bytecode/writer.jl @@ -234,18 +234,18 @@ end =============================================================================# """ - ReduceIdentity + OperationIdentity -Abstract type for reduce identity attributes. 
+Abstract type for binary operation identity attributes (reduce, scan, etc.). """ -abstract type ReduceIdentity end +abstract type OperationIdentity end """ FloatIdentity(value, type_id, dtype) -Float identity value for reduce operations. +Float identity value for binary operations. """ -struct FloatIdentity <: ReduceIdentity +struct FloatIdentity <: OperationIdentity value::Float64 type_id::TypeId dtype::Type # Float16, Float32, Float64, etc. @@ -254,9 +254,9 @@ end """ IntegerIdentity(value, type_id, dtype, signed) -Integer identity value for reduce operations (add, max, mul, min, and, or, xor). +Integer identity value for binary operations. """ -struct IntegerIdentity <: ReduceIdentity +struct IntegerIdentity <: OperationIdentity value::Int64 # Store as signed Int64, will be reinterpreted as unsigned type_id::TypeId dtype::Type # Int8, Int16, Int32, Int64, UInt8, etc. @@ -335,10 +335,10 @@ end """ encode_identity_array!(cb, identities) -Encode an array of reduce identity attributes. +Encode an array of binary operation identity attributes. Dispatches on identity type to encode correctly. 
""" -function encode_identity_array!(cb::CodeBuilder, identities::Vector{<:ReduceIdentity}) +function encode_identity_array!(cb::CodeBuilder, identities::Vector{<:OperationIdentity}) encode_varint!(cb.buf, length(identities)) for identity in identities encode_identity!(cb, identity) diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl index b89c1c5..1c8df86 100644 --- a/src/compiler/intrinsics.jl +++ b/src/compiler/intrinsics.jl @@ -8,7 +8,7 @@ using Base: compilerbarrier, donotdelete using ..cuTile: Tile, TileArray, Constant, TensorView, PartitionView using ..cuTile: Signedness, SignednessSigned, SignednessUnsigned using ..cuTile: ComparisonPredicate, CmpLessThan, CmpLessThanOrEqual, CmpGreaterThan, CmpGreaterThanOrEqual, CmpEqual, CmpNotEqual -using ..cuTile: ReduceIdentity, FloatIdentity, IntegerIdentity +using ..cuTile: OperationIdentity, FloatIdentity, IntegerIdentity end From 28a8875af3e78ed3b61b3ff9a0618b0ed05b093b Mon Sep 17 00:00:00 2001 From: Arhik Date: Mon, 12 Jan 2026 00:29:54 +0530 Subject: [PATCH 14/31] rename identity types to IdentityOp hierarchy - IdentityOp: abstract type for binary operation identities - FloatIdentityOp: concrete type for float identities - IntegerIdentityOp: concrete type for integer identities (with signed field) - Applied consistently across writer.jl, encodings.jl, intrinsics.jl, and core.jl --- src/bytecode/encodings.jl | 2 +- src/bytecode/writer.jl | 26 +++++++++++++------------- src/compiler/intrinsics.jl | 2 +- src/compiler/intrinsics/core.jl | 28 ++++++++++++++-------------- 4 files changed, 29 insertions(+), 29 deletions(-) diff --git a/src/bytecode/encodings.jl b/src/bytecode/encodings.jl index 9305bee..1c06dbb 100644 --- a/src/bytecode/encodings.jl +++ b/src/bytecode/encodings.jl @@ -1291,7 +1291,7 @@ function encode_ReduceOp!(body::Function, cb::CodeBuilder, result_types::Vector{TypeId}, operands::Vector{Value}, dim::Int, - identities::Vector{<:ReduceIdentity}, + identities::Vector{<:IdentityOp}, 
body_scalar_types::Vector{TypeId}) encode_varint!(cb.buf, Opcode.ReduceOp) diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl index 0ef181c..33ff20a 100644 --- a/src/bytecode/writer.jl +++ b/src/bytecode/writer.jl @@ -234,29 +234,29 @@ end =============================================================================# """ - OperationIdentity + IdentityOp Abstract type for binary operation identity attributes (reduce, scan, etc.). """ -abstract type OperationIdentity end +abstract type IdentityOp end """ - FloatIdentity(value, type_id, dtype) + FloatIdentityOp(value, type_id, dtype) Float identity value for binary operations. """ -struct FloatIdentity <: OperationIdentity +struct FloatIdentityOp <: IdentityOp value::Float64 type_id::TypeId dtype::Type # Float16, Float32, Float64, etc. end """ - IntegerIdentity(value, type_id, dtype, signed) + IntegerIdentityOp(value, type_id, dtype, signed) Integer identity value for binary operations. """ -struct IntegerIdentity <: OperationIdentity +struct IntegerIdentityOp <: IdentityOp value::Int64 # Store as signed Int64, will be reinterpreted as unsigned type_id::TypeId dtype::Type # Int8, Int16, Int32, Int64, UInt8, etc. @@ -264,12 +264,12 @@ struct IntegerIdentity <: OperationIdentity end """ - encode_tagged_float!(cb, identity::FloatIdentity) + encode_tagged_float!(cb, identity::FloatIdentityOp) Encode a tagged float attribute for reduce identity. Format: tag(Float=0x02) + typeid + ap_int(value_bits) """ -function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentity) +function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentityOp) # Tag for Float attribute push!(cb.buf, 0x02) # Type ID @@ -280,12 +280,12 @@ function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentity) end """ - encode_tagged_int!(cb, identity::IntegerIdentity) + encode_tagged_int!(cb, identity::IntegerIdentityOp) Encode a tagged integer identity attribute. 
Format: tag(Int=0x01) + typeid + ap_int(value) """ -function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentity) +function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentityOp) # Tag for Int attribute push!(cb.buf, 0x01) # Type ID @@ -338,7 +338,7 @@ end Encode an array of binary operation identity attributes. Dispatches on identity type to encode correctly. """ -function encode_identity_array!(cb::CodeBuilder, identities::Vector{<:OperationIdentity}) +function encode_identity_array!(cb::CodeBuilder, identities::Vector{<:IdentityOp}) encode_varint!(cb.buf, length(identities)) for identity in identities encode_identity!(cb, identity) @@ -350,8 +350,8 @@ end Encode a single identity attribute, dispatching on type. """ -encode_identity!(cb::CodeBuilder, identity::FloatIdentity) = encode_tagged_float!(cb, identity) -encode_identity!(cb::CodeBuilder, identity::IntegerIdentity) = encode_tagged_int!(cb, identity) +encode_identity!(cb::CodeBuilder, identity::FloatIdentityOp) = encode_tagged_float!(cb, identity) +encode_identity!(cb::CodeBuilder, identity::IntegerIdentityOp) = encode_tagged_int!(cb, identity) """ BytecodeWriter diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl index 1c8df86..66791bd 100644 --- a/src/compiler/intrinsics.jl +++ b/src/compiler/intrinsics.jl @@ -8,7 +8,7 @@ using Base: compilerbarrier, donotdelete using ..cuTile: Tile, TileArray, Constant, TensorView, PartitionView using ..cuTile: Signedness, SignednessSigned, SignednessUnsigned using ..cuTile: ComparisonPredicate, CmpLessThan, CmpLessThanOrEqual, CmpGreaterThan, CmpGreaterThanOrEqual, CmpEqual, CmpNotEqual -using ..cuTile: OperationIdentity, FloatIdentity, IntegerIdentity +using ..cuTile: IdentityOp, FloatIdentityOp, IntegerIdentityOp end diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index 2793e0c..10ef1ac 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -675,46 +675,46 @@ 
is_signed(::Type{T}) where T <: Integer = T <: SignedInteger is_signed(::Type{T}) where T <: AbstractFloat = false """ - reduce_identity(reduce_fn, dtype, elem_type) -> ReduceIdentity + reduce_identity(reduce_fn, dtype, elem_type) -> IdentityOp -Return the identity value for a reduction operation. -Identity must satisfy: identity ⊕ x = x for the reduction operation. +Return the identity value for a binary operation (reduce, scan, etc.). +Identity must satisfy: identity ⊕ x = x for the operation. """ # Addition identity: 0 + x = x reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: AbstractFloat = - FloatIdentity(0.0, dtype, T) + FloatIdentityOp(0.0, dtype, T) reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentity(0, dtype, T, is_signed(T)) + IntegerIdentityOp(0, dtype, T, is_signed(T)) # Maximum identity: max(-Inf, x) = x reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: AbstractFloat = - FloatIdentity(-Inf, dtype, T) + FloatIdentityOp(-Inf, dtype, T) reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentity(0, dtype, T, is_signed(T)) # For integers, use 0 as identity (max(0, x) = x) + IntegerIdentityOp(0, dtype, T, is_signed(T)) # For integers, use 0 as identity (max(0, x) = x) # Multiplication identity: 1 * x = x reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: AbstractFloat = - FloatIdentity(1.0, dtype, T) + FloatIdentityOp(1.0, dtype, T) reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentity(1, dtype, T, is_signed(T)) + IntegerIdentityOp(1, dtype, T, is_signed(T)) # Minimum identity: min(+Inf, x) = x reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: AbstractFloat = - FloatIdentity(+Inf, dtype, T) + FloatIdentityOp(+Inf, dtype, T) reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentity(typemax(Int64), dtype, T, is_signed(T)) # Use max int as +Inf proxy + IntegerIdentityOp(typemax(Int64), dtype, T, is_signed(T)) # 
Use max int as +Inf proxy # AND identity: all bits set (x & -1 == x) reduce_identity(::Val{:and}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentity(-1, dtype, T, is_signed(T)) # All bits set + IntegerIdentityOp(-1, dtype, T, is_signed(T)) # All bits set # OR identity: 0 | x = x reduce_identity(::Val{:or}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentity(0, dtype, T, is_signed(T)) + IntegerIdentityOp(0, dtype, T, is_signed(T)) # XOR identity: 0 ⊕ x = x reduce_identity(::Val{:xor}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentity(0, dtype, T, is_signed(T)) + IntegerIdentityOp(0, dtype, T, is_signed(T)) #============================================================================= Reduce Body Operations - dispatch on Val{fn} and elem_type From 56f3376f0b42794442a470ba3a36726702c060b1 Mon Sep 17 00:00:00 2001 From: Arhik Date: Mon, 12 Jan 2026 01:13:05 +0530 Subject: [PATCH 15/31] fix is_signed to use proper Julia type hierarchy check - T <: Integer && !(T <: Unsigned) correctly identifies: - Int32, Int64, etc. as signed (true) - UInt32, UInt64, etc. as unsigned (false) --- src/compiler/intrinsics/core.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index 10ef1ac..059d7dc 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -671,7 +671,7 @@ end Return true if type T is signed, false for unsigned types. """ -is_signed(::Type{T}) where T <: Integer = T <: SignedInteger +is_signed(::Type{T}) where T <: Integer = T <: Integer && !(T <: Unsigned) is_signed(::Type{T}) where T <: AbstractFloat = false """ From 0327a2e873d3639039a5f3b4c44b01eb59ada708 Mon Sep 17 00:00:00 2001 From: Arhik Date: Mon, 12 Jan 2026 01:20:57 +0530 Subject: [PATCH 16/31] intrinsics: use -one(T) instead of -1 for signed AND identity Ensures type-consistent encoding for Int8, Int16, etc. 
intrinsics: use type-dependent identity values for reduce ops - add: zero(T) - max: typemin(T) - mul: one(T) - min: typemax(T) - and: is_signed ? -1 : typemax(T) for proper bit representation - or, xor: zero(T) Fixes encoding error for UInt32 (9223372036854775807 does not fit in 32 bits) Update core.jl fix reduce_min identity to use typemax(T) instead of typemax(Int64) - For UInt32, typemax(UInt32) = 4294967295 fits in 32 bits - typemax(Int64) = 9223372036854775807 does not fit and caused encoding error --- src/compiler/intrinsics/core.jl | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index 059d7dc..85e4776 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -682,39 +682,41 @@ Identity must satisfy: identity ⊕ x = x for the operation. """ # Addition identity: 0 + x = x reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: AbstractFloat = - FloatIdentityOp(0.0, dtype, T) + FloatIdentityOp(zero(T), dtype, T) reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentityOp(0, dtype, T, is_signed(T)) + IntegerIdentityOp(zero(T), dtype, T, is_signed(T)) -# Maximum identity: max(-Inf, x) = x +# Maximum identity: max(typemin(T), x) = x reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: AbstractFloat = - FloatIdentityOp(-Inf, dtype, T) + FloatIdentityOp(typemin(T), dtype, T) reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentityOp(0, dtype, T, is_signed(T)) # For integers, use 0 as identity (max(0, x) = x) + IntegerIdentityOp(typemin(T), dtype, T, is_signed(T)) # Multiplication identity: 1 * x = x reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: AbstractFloat = - FloatIdentityOp(1.0, dtype, T) + FloatIdentityOp(one(T), dtype, T) reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentityOp(1, dtype, T, is_signed(T)) + 
IntegerIdentityOp(one(T), dtype, T, is_signed(T)) -# Minimum identity: min(+Inf, x) = x +# Minimum identity: min(typemax(T), x) = x reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: AbstractFloat = - FloatIdentityOp(+Inf, dtype, T) + FloatIdentityOp(typemax(T), dtype, T) reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentityOp(typemax(Int64), dtype, T, is_signed(T)) # Use max int as +Inf proxy + IntegerIdentityOp(typemax(T), dtype, T, is_signed(T)) -# AND identity: all bits set (x & -1 == x) +# AND identity: all bits set (x & identity == x) +# For signed: -one(T) has all bits set in two's complement +# For unsigned: typemax(T) has all bits set reduce_identity(::Val{:and}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentityOp(-1, dtype, T, is_signed(T)) # All bits set + IntegerIdentityOp(is_signed(T) ? -one(T) : typemax(T), dtype, T, is_signed(T)) # OR identity: 0 | x = x reduce_identity(::Val{:or}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentityOp(0, dtype, T, is_signed(T)) + IntegerIdentityOp(zero(T), dtype, T, is_signed(T)) # XOR identity: 0 ⊕ x = x reduce_identity(::Val{:xor}, dtype, ::Type{T}) where T <: Integer = - IntegerIdentityOp(0, dtype, T, is_signed(T)) + IntegerIdentityOp(zero(T), dtype, T, is_signed(T)) #============================================================================= Reduce Body Operations - dispatch on Val{fn} and elem_type From d1c9c0a849f9881342476fdabc517107d7b96bc6 Mon Sep 17 00:00:00 2001 From: Arhik Date: Mon, 12 Jan 2026 02:22:31 +0530 Subject: [PATCH 17/31] test: restore codegen and types tests, fix reduce_ops reference test: add comprehensive reduce operations tests - Tests for all reduce ops: add, mul, min, max, and, or, xor - Tests for Float32, Float64, Int32, UInt32, Int8 types - Tests for axis 0 and axis 1 reductions - Compares GPU results against CPU reference implementations - Includes UInt32 and Int8 tests for identity encoding fix --- test/reduce_ops.jl | 749 
+++++++++++++++++++++++ test/runtests.jl | 2 +- 2 files changed, 750 insertions(+), 1 deletion(-) create mode 100644 test/reduce_ops.jl diff --git a/test/reduce_ops.jl b/test/reduce_ops.jl new file mode 100644 index 0000000..762f37e --- /dev/null +++ b/test/reduce_ops.jl @@ -0,0 +1,749 @@ +using cuTile +import cuTile as ct +using CUDA +using Test + +@testset "reduce operations" begin + +#====================================================================== +CPU reference implementations +======================================================================# + +cpu_reduce_add(a::AbstractArray, dims::Integer) = sum(a, dims=dims) +cpu_reduce_mul(a::AbstractArray, dims::Integer) = prod(a, dims=dims) +cpu_reduce_max(a::AbstractArray, dims::Integer) = maximum(a, dims=dims) +cpu_reduce_min(a::AbstractArray, dims::Integer) = minimum(a, dims=dims) + +cpu_reduce_and(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x & y, a, dims=dims) +cpu_reduce_or(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x | y, a, dims=dims) +cpu_reduce_xor(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x ⊻ y, a, dims=dims) + +#====================================================================== +Float32 operations +======================================================================# + +@testset "Float32 reduce_add" begin + function reduce_add_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 128)) + sums = ct.reduce_sum(tile, 2) + ct.store(b, pid, sums) + return + end + + m, n = 64, 128 + a = CUDA.rand(Float32, m, n) + b = CUDA.zeros(Float32, m) + + ct.launch(reduce_add_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] ≈ cpu_reduce_add(a_cpu[i:i, :], 2)[1] rtol=1e-3 + end +end + +@testset "Float32 reduce_mul" begin + function reduce_mul_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile = 
ct.load(a, (pid, 1), (1, 32)) + products = ct.reduce_mul(tile, 2) + ct.store(b, pid, products) + return + end + + m, n = 32, 64 + a = CUDA.rand(Float32, m, n) .+ 0.1f0 + b = CUDA.ones(Float32, m) + + ct.launch(reduce_mul_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] ≈ cpu_reduce_mul(a_cpu[i:i, :], 2)[1] rtol=1e-2 + end +end + +@testset "Float32 reduce_max" begin + function reduce_max_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 128)) + maxes = ct.reduce_max(tile, 2) + ct.store(b, pid, maxes) + return + end + + m, n = 64, 128 + a = CUDA.rand(Float32, m, n) + b = CUDA.zeros(Float32, m) + + ct.launch(reduce_max_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] ≈ cpu_reduce_max(a_cpu[i:i, :], 2)[1] rtol=1e-5 + end +end + +@testset "Float32 reduce_min" begin + function reduce_min_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 128)) + mins = ct.reduce_min(tile, 2) + ct.store(b, pid, mins) + return + end + + m, n = 64, 128 + a = CUDA.rand(Float32, m, n) + b = CUDA.zeros(Float32, m) + + ct.launch(reduce_min_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] ≈ cpu_reduce_min(a_cpu[i:i, :], 2)[1] rtol=1e-5 + end +end + +#====================================================================== +Float64 operations +======================================================================# + +@testset "Float64 reduce_add" begin + function reduce_add_f64_kernel(a::ct.TileArray{Float64,2}, b::ct.TileArray{Float64,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 64)) + sums = ct.reduce_sum(tile, 2) + ct.store(b, pid, sums) + return + end + + m, n = 32, 64 + a = CUDA.rand(Float64, m, n) + b = CUDA.zeros(Float64, m) + + ct.launch(reduce_add_f64_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test 
b_cpu[i] ≈ cpu_reduce_add(a_cpu[i:i, :], 2)[1] rtol=1e-5 + end +end + +@testset "Float64 reduce_max" begin + function reduce_max_f64_kernel(a::ct.TileArray{Float64,2}, b::ct.TileArray{Float64,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 64)) + maxes = ct.reduce_max(tile, 2) + ct.store(b, pid, maxes) + return + end + + m, n = 32, 64 + a = CUDA.rand(Float64, m, n) + b = CUDA.zeros(Float64, m) + + ct.launch(reduce_max_f64_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] ≈ cpu_reduce_max(a_cpu[i:i, :], 2)[1] rtol=1e-5 + end +end + +@testset "Float64 reduce_min" begin + function reduce_min_f64_kernel(a::ct.TileArray{Float64,2}, b::ct.TileArray{Float64,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 64)) + mins = ct.reduce_min(tile, 2) + ct.store(b, pid, mins) + return + end + + m, n = 32, 64 + a = CUDA.rand(Float64, m, n) + b = CUDA.zeros(Float64, m) + + ct.launch(reduce_min_f64_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] ≈ cpu_reduce_min(a_cpu[i:i, :], 2)[1] rtol=1e-5 + end +end + +@testset "Float64 reduce_mul" begin + function reduce_mul_f64_kernel(a::ct.TileArray{Float64,2}, b::ct.TileArray{Float64,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 32)) + products = ct.reduce_mul(tile, 2) + ct.store(b, pid, products) + return + end + + m, n = 16, 32 + a = CUDA.rand(Float64, m, n) .+ 0.1 + b = CUDA.ones(Float64, m) + + ct.launch(reduce_mul_f64_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] ≈ cpu_reduce_mul(a_cpu[i:i, :], 2)[1] rtol=1e-2 + end +end + +#====================================================================== +Int32 operations +======================================================================# + +@testset "Int32 reduce_add" begin + function reduce_add_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 64)) + sums = ct.reduce_sum(tile, 2) + 
ct.store(b, pid, sums) + return + end + + m, n = 32, 64 + a = CUDA.rand(Int32, m, n) .+ 1 + b = CUDA.zeros(Int32, m) + + ct.launch(reduce_add_i32_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] == cpu_reduce_add(a_cpu[i:i, :], 2)[1] + end +end + +@testset "Int32 reduce_mul" begin + function reduce_mul_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 16)) + products = ct.reduce_mul(tile, 2) + ct.store(b, pid, products) + return + end + + m, n = 8, 16 + a = CUDA.rand(Int32, m, n) .% 10 .+ 2 + b = CUDA.ones(Int32, m) + + ct.launch(reduce_mul_i32_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] == cpu_reduce_mul(a_cpu[i:i, :], 2)[1] + end +end + +@testset "Int32 reduce_max" begin + function reduce_max_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 64)) + maxes = ct.reduce_max(tile, 2) + ct.store(b, pid, maxes) + return + end + + m, n = 32, 64 + a = CUDA.rand(Int32, m, n) + b = CUDA.fill(typemin(Int32), m) + + ct.launch(reduce_max_i32_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] == cpu_reduce_max(a_cpu[i:i, :], 2)[1] + end +end + +@testset "Int32 reduce_min" begin + function reduce_min_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 64)) + mins = ct.reduce_min(tile, 2) + ct.store(b, pid, mins) + return + end + + m, n = 32, 64 + a = CUDA.rand(Int32, m, n) + b = CUDA.fill(typemax(Int32), m) + + ct.launch(reduce_min_i32_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] == cpu_reduce_min(a_cpu[i:i, :], 2)[1] + end +end + +@testset "Int32 reduce_and" begin + function reduce_and_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 32)) + result = 
ct.reduce_and(tile, 2) + ct.store(b, pid, result) + return + end + + m, n = 16, 32 + a = CUDA.rand(Int32, m, n) + b = CUDA.zeros(Int32, m) + + ct.launch(reduce_and_i32_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] == cpu_reduce_and(a_cpu[i:i, :], 2)[1] + end +end + +@testset "Int32 reduce_or" begin + function reduce_or_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 32)) + result = ct.reduce_or(tile, 2) + ct.store(b, pid, result) + return + end + + m, n = 16, 32 + a = CUDA.rand(Int32, m, n) + b = CUDA.zeros(Int32, m) + + ct.launch(reduce_or_i32_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] == cpu_reduce_or(a_cpu[i:i, :], 2)[1] + end +end + +@testset "Int32 reduce_xor" begin + function reduce_xor_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 32)) + result = ct.reduce_xor(tile, 2) + ct.store(b, pid, result) + return + end + + m, n = 16, 32 + a = CUDA.rand(Int32, m, n) + b = CUDA.zeros(Int32, m) + + ct.launch(reduce_xor_i32_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] == cpu_reduce_xor(a_cpu[i:i, :], 2)[1] + end +end + +#====================================================================== +UInt32 operations - tests AND identity encoding fix +======================================================================# + +@testset "UInt32 reduce_add" begin + function reduce_add_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 64)) + sums = ct.reduce_sum(tile, 2) + ct.store(b, pid, sums) + return + end + + m, n = 32, 64 + a = CUDA.rand(UInt32, m, n) + b = CUDA.zeros(UInt32, m) + + ct.launch(reduce_add_u32_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] == cpu_reduce_add(a_cpu[i:i, :], 2)[1] + end +end + 
+@testset "UInt32 reduce_mul" begin + function reduce_mul_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 16)) + products = ct.reduce_mul(tile, 2) + ct.store(b, pid, products) + return + end + + m, n = 8, 16 + a = CUDA.rand(UInt32, m, n) .% 10 .+ 2 + b = CUDA.ones(UInt32, m) + + ct.launch(reduce_mul_u32_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] == cpu_reduce_mul(a_cpu[i:i, :], 2)[1] + end +end + +@testset "UInt32 reduce_max" begin + function reduce_max_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 64)) + maxes = ct.reduce_max(tile, 2) + ct.store(b, pid, maxes) + return + end + + m, n = 32, 64 + a = CUDA.rand(UInt32, m, n) + b = CUDA.zeros(UInt32, m) + + ct.launch(reduce_max_u32_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] == cpu_reduce_max(a_cpu[i:i, :], 2)[1] + end +end + +@testset "UInt32 reduce_min" begin + function reduce_min_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 64)) + mins = ct.reduce_min(tile, 2) + ct.store(b, pid, mins) + return + end + + m, n = 32, 64 + a = CUDA.rand(UInt32, m, n) + b = CUDA.fill(typemax(UInt32), m) + + ct.launch(reduce_min_u32_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] == cpu_reduce_min(a_cpu[i:i, :], 2)[1] + end +end + +@testset "UInt32 reduce_and" begin + function reduce_and_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 32)) + result = ct.reduce_and(tile, 2) + ct.store(b, pid, result) + return + end + + m, n = 16, 32 + a = CUDA.rand(UInt32, m, n) + b = CUDA.zeros(UInt32, m) + + ct.launch(reduce_and_u32_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] == cpu_reduce_and(a_cpu[i:i, 
:], 2)[1] + end +end + +@testset "UInt32 reduce_or" begin + function reduce_or_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 32)) + result = ct.reduce_or(tile, 2) + ct.store(b, pid, result) + return + end + + m, n = 16, 32 + a = CUDA.rand(UInt32, m, n) + b = CUDA.zeros(UInt32, m) + + ct.launch(reduce_or_u32_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] == cpu_reduce_or(a_cpu[i:i, :], 2)[1] + end +end + +@testset "UInt32 reduce_xor" begin + function reduce_xor_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 32)) + result = ct.reduce_xor(tile, 2) + ct.store(b, pid, result) + return + end + + m, n = 16, 32 + a = CUDA.rand(UInt32, m, n) + b = CUDA.zeros(UInt32, m) + + ct.launch(reduce_xor_u32_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] == cpu_reduce_xor(a_cpu[i:i, :], 2)[1] + end +end + +#====================================================================== +Int8 operations - smaller integer type for encoding tests +====================================================================== + +@testset "Int8 reduce_add" begin + function reduce_add_i8_kernel(a::ct.TileArray{Int8,2}, b::ct.TileArray{Int8,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 32)) + sums = ct.reduce_sum(tile, 2) + ct.store(b, pid, sums) + return + end + + m, n = 16, 32 + a = CUDA.rand(Int8, m, n) + b = CUDA.zeros(Int8, m) + + ct.launch(reduce_add_i8_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test Int32(b_cpu[i]) == cpu_reduce_add(a_cpu[i:i, :], 2)[1] + end +end + +@testset "Int8 reduce_max" begin + function reduce_max_i8_kernel(a::ct.TileArray{Int8,2}, b::ct.TileArray{Int8,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 32)) + maxes = ct.reduce_max(tile, 2) + ct.store(b, pid, maxes) + return + end + + m, n = 16, 32 + a = 
CUDA.rand(Int8, m, n) + b = CUDA.fill(typemin(Int8), m) + + ct.launch(reduce_max_i8_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] == cpu_reduce_max(a_cpu[i:i, :], 2)[1] + end +end + +@testset "Int8 reduce_min" begin + function reduce_min_i8_kernel(a::ct.TileArray{Int8,2}, b::ct.TileArray{Int8,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 32)) + mins = ct.reduce_min(tile, 2) + ct.store(b, pid, mins) + return + end + + m, n = 16, 32 + a = CUDA.rand(Int8, m, n) + b = CUDA.fill(typemax(Int8), m) + + ct.launch(reduce_min_i8_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] == cpu_reduce_min(a_cpu[i:i, :], 2)[1] + end +end + +@testset "Int8 reduce_and" begin + function reduce_and_i8_kernel(a::ct.TileArray{Int8,2}, b::ct.TileArray{Int8,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 16)) + result = ct.reduce_and(tile, 2) + ct.store(b, pid, result) + return + end + + m, n = 8, 16 + a = CUDA.rand(Int8, m, n) + b = CUDA.zeros(Int8, m) + + ct.launch(reduce_and_i8_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test Int32(b_cpu[i]) == cpu_reduce_and(a_cpu[i:i, :], 2)[1] + end +end + +@testset "Int8 reduce_or" begin + function reduce_or_i8_kernel(a::ct.TileArray{Int8,2}, b::ct.TileArray{Int8,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 16)) + result = ct.reduce_or(tile, 2) + ct.store(b, pid, result) + return + end + + m, n = 8, 16 + a = CUDA.rand(Int8, m, n) + b = CUDA.zeros(Int8, m) + + ct.launch(reduce_or_i8_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test Int32(b_cpu[i]) == cpu_reduce_or(a_cpu[i:i, :], 2)[1] + end +end + +@testset "Int8 reduce_xor" begin + function reduce_xor_i8_kernel(a::ct.TileArray{Int8,2}, b::ct.TileArray{Int8,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 16)) + result = ct.reduce_xor(tile, 2) + ct.store(b, pid, result) + return + end + + m, n = 8, 16 + a = CUDA.rand(Int8, m, n) 
+ b = CUDA.zeros(Int8, m) + + ct.launch(reduce_xor_i8_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test Int32(b_cpu[i]) == cpu_reduce_xor(a_cpu[i:i, :], 2)[1] + end +end + +#====================================================================== +Axis 0 reductions - verify both axes work +====================================================================== + +@testset "axis 0 reduce_sum Float32" begin + function reduce_sum_axis0_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile = ct.load(a, (1, pid), (64, 1)) + sums = ct.reduce_sum(tile, 1) + ct.store(b, pid, sums) + return + end + + m, n = 64, 128 + a = CUDA.rand(Float32, m, n) + b = CUDA.zeros(Float32, n) + + ct.launch(reduce_sum_axis0_kernel, n, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for j in 1:n + @test b_cpu[j] ≈ cpu_reduce_add(a_cpu[:, j:j], 1)[1] rtol=1e-3 + end +end + +@testset "axis 0 reduce_min Int32" begin + function reduce_min_axis0_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1}) + pid = ct.bid(1) + tile = ct.load(a, (1, pid), (32, 1)) + mins = ct.reduce_min(tile, 1) + ct.store(b, pid, mins) + return + end + + m, n = 32, 64 + a = CUDA.rand(Int32, m, n) + b = CUDA.fill(typemax(Int32), n) + + ct.launch(reduce_min_axis0_i32_kernel, n, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for j in 1:n + @test b_cpu[j] == cpu_reduce_min(a_cpu[:, j:j], 1)[1] + end +end + +@testset "axis 0 reduce_max UInt32" begin + function reduce_max_axis0_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1}) + pid = ct.bid(1) + tile = ct.load(a, (1, pid), (32, 1)) + maxes = ct.reduce_max(tile, 1) + ct.store(b, pid, maxes) + return + end + + m, n = 32, 64 + a = CUDA.rand(UInt32, m, n) + b = CUDA.zeros(UInt32, n) + + ct.launch(reduce_max_axis0_u32_kernel, n, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for j in 1:n + @test b_cpu[j] == cpu_reduce_max(a_cpu[:, j:j], 1)[1] + end +end + +@testset "axis 0 reduce_and UInt32" begin + 
function reduce_and_axis0_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1}) + pid = ct.bid(1) + tile = ct.load(a, (1, pid), (16, 1)) + result = ct.reduce_and(tile, 1) + ct.store(b, pid, result) + return + end + + m, n = 16, 32 + a = CUDA.rand(UInt32, m, n) + b = CUDA.fill(typemax(UInt32), n) + + ct.launch(reduce_and_axis0_u32_kernel, n, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for j in 1:n + @test b_cpu[j] == cpu_reduce_and(a_cpu[:, j:j], 1)[1] + end +end + +end # @testset "reduce operations" diff --git a/test/runtests.jl b/test/runtests.jl index 6b9aed9..a52163a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -45,7 +45,7 @@ if filter_tests!(testsuite, args) cuda_functional = CUDA.functional() filter!(testsuite) do (test, _) - if in(test, ["execution"]) || startswith(test, "examples/") + if in(test, ["execution", "reduce_ops"]) || startswith(test, "examples/") return cuda_functional else return true From 2c3642ac30a0e2ac49cf090a1aeb4595ce9ccf18 Mon Sep 17 00:00:00 2001 From: Arhik Date: Mon, 12 Jan 2026 02:33:52 +0530 Subject: [PATCH 18/31] multiline comment mess in reduce_ops.jl used agent to create tests and hence the wrath. 
--- test/reduce_ops.jl | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/test/reduce_ops.jl b/test/reduce_ops.jl index 762f37e..a352c69 100644 --- a/test/reduce_ops.jl +++ b/test/reduce_ops.jl @@ -5,9 +5,9 @@ using Test @testset "reduce operations" begin -#====================================================================== -CPU reference implementations -====================================================================== +#======================================================================# +# CPU reference implementations +# =====================================================================# cpu_reduce_add(a::AbstractArray, dims::Integer) = sum(a, dims=dims) cpu_reduce_mul(a::AbstractArray, dims::Integer) = prod(a, dims=dims) @@ -18,9 +18,9 @@ cpu_reduce_and(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x cpu_reduce_or(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x | y, a, dims=dims) cpu_reduce_xor(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x ⊻ y, a, dims=dims) -#====================================================================== -Float32 operations -====================================================================== +#======================================================================# +# Float32 operations +#======================================================================# @testset "Float32 reduce_add" begin function reduce_add_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) @@ -110,9 +110,9 @@ end end end -#====================================================================== -Float64 operations -====================================================================== +#======================================================================# +# Float64 operations +#======================================================================# @testset "Float64 reduce_add" begin function 
reduce_add_f64_kernel(a::ct.TileArray{Float64,2}, b::ct.TileArray{Float64,1}) @@ -202,9 +202,9 @@ end end end -#====================================================================== -Int32 operations -====================================================================== +#======================================================================# +# Int32 operations +#======================================================================# @testset "Int32 reduce_add" begin function reduce_add_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1}) @@ -360,9 +360,9 @@ end end end -#====================================================================== -UInt32 operations - tests AND identity encoding fix -====================================================================== +#======================================================================# +# UInt32 operations - tests AND identity encoding fix +#======================================================================# @testset "UInt32 reduce_add" begin function reduce_add_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1}) @@ -518,9 +518,9 @@ end end end -#====================================================================== -Int8 operations - smaller integer type for encoding tests -====================================================================== +#======================================================================# +# Int8 operations - smaller integer type for encoding tests +#======================================================================# @testset "Int8 reduce_add" begin function reduce_add_i8_kernel(a::ct.TileArray{Int8,2}, b::ct.TileArray{Int8,1}) @@ -654,9 +654,9 @@ end end end -#====================================================================== -Axis 0 reductions - verify both axes work -====================================================================== +#======================================================================# +# Axis 0 reductions - verify both 
axes work +#======================================================================# @testset "axis 0 reduce_sum Float32" begin function reduce_sum_axis0_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) From 212786c919ac31a239afe44bd8be6954b2c2c6f4 Mon Sep 17 00:00:00 2001 From: Arhik Date: Mon, 12 Jan 2026 02:55:14 +0530 Subject: [PATCH 19/31] intrinsics: rename reduce_identity -> operation_identity Prepares for reuse by scan operations. Function is shape-agnostic and depends only on operation type and element type. --- src/compiler/intrinsics/core.jl | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index 85e4776..1991150 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -649,7 +649,7 @@ function emit_reduce!(ctx::CGCtx, args, reduce_fn::Symbol) scalar_tile_type = tile_type!(tt, dtype, Int[]) # Create identity value via dispatch on reduction function and element type - identity = reduce_identity(Val(reduce_fn), dtype, elem_type) + identity = operation_identity(Val(reduce_fn), dtype, elem_type) # Emit ReduceOp results = encode_ReduceOp!(cb, [output_tile_type], [input_tv.v], axis, [identity], [scalar_tile_type]) do block_args @@ -675,47 +675,47 @@ is_signed(::Type{T}) where T <: Integer = T <: Integer && !(T <: Unsigned) is_signed(::Type{T}) where T <: AbstractFloat = false """ - reduce_identity(reduce_fn, dtype, elem_type) -> IdentityOp + operation_identity(fn, dtype, elem_type) -> IdentityOp Return the identity value for a binary operation (reduce, scan, etc.). Identity must satisfy: identity ⊕ x = x for the operation. 
""" # Addition identity: 0 + x = x -reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: AbstractFloat = +operation_identity(::Val{:add}, dtype, ::Type{T}) where T <: AbstractFloat = FloatIdentityOp(zero(T), dtype, T) -reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: Integer = +operation_identity(::Val{:add}, dtype, ::Type{T}) where T <: Integer = IntegerIdentityOp(zero(T), dtype, T, is_signed(T)) # Maximum identity: max(typemin(T), x) = x -reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: AbstractFloat = +operation_identity(::Val{:max}, dtype, ::Type{T}) where T <: AbstractFloat = FloatIdentityOp(typemin(T), dtype, T) -reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer = +operation_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer = IntegerIdentityOp(typemin(T), dtype, T, is_signed(T)) # Multiplication identity: 1 * x = x -reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: AbstractFloat = +operation_identity(::Val{:mul}, dtype, ::Type{T}) where T <: AbstractFloat = FloatIdentityOp(one(T), dtype, T) -reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: Integer = +operation_identity(::Val{:mul}, dtype, ::Type{T}) where T <: Integer = IntegerIdentityOp(one(T), dtype, T, is_signed(T)) # Minimum identity: min(typemax(T), x) = x -reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: AbstractFloat = +operation_identity(::Val{:min}, dtype, ::Type{T}) where T <: AbstractFloat = FloatIdentityOp(typemax(T), dtype, T) -reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: Integer = +operation_identity(::Val{:min}, dtype, ::Type{T}) where T <: Integer = IntegerIdentityOp(typemax(T), dtype, T, is_signed(T)) # AND identity: all bits set (x & identity == x) # For signed: -one(T) has all bits set in two's complement # For unsigned: typemax(T) has all bits set -reduce_identity(::Val{:and}, dtype, ::Type{T}) where T <: Integer = +operation_identity(::Val{:and}, dtype, ::Type{T}) where T <: Integer = 
IntegerIdentityOp(is_signed(T) ? -one(T) : typemax(T), dtype, T, is_signed(T)) # OR identity: 0 | x = x -reduce_identity(::Val{:or}, dtype, ::Type{T}) where T <: Integer = +operation_identity(::Val{:or}, dtype, ::Type{T}) where T <: Integer = IntegerIdentityOp(zero(T), dtype, T, is_signed(T)) # XOR identity: 0 ⊕ x = x -reduce_identity(::Val{:xor}, dtype, ::Type{T}) where T <: Integer = +operation_identity(::Val{:xor}, dtype, ::Type{T}) where T <: Integer = IntegerIdentityOp(zero(T), dtype, T, is_signed(T)) #============================================================================= From 5dfde9e7e83c9a061b8b6387a8a19b0ff3d02cc9 Mon Sep 17 00:00:00 2001 From: Arhik Date: Mon, 12 Jan 2026 03:03:36 +0530 Subject: [PATCH 20/31] test: fix CPU reference functions for bitwise ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Julia's reduce with dims= requires explicit init for &,|,⊻ operators. Use typemax(T) for AND (identity with all bits set). --- test/reduce_ops.jl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/reduce_ops.jl b/test/reduce_ops.jl index a352c69..36dc9ee 100644 --- a/test/reduce_ops.jl +++ b/test/reduce_ops.jl @@ -14,9 +14,10 @@ cpu_reduce_mul(a::AbstractArray, dims::Integer) = prod(a, dims=dims) cpu_reduce_max(a::AbstractArray, dims::Integer) = maximum(a, dims=dims) cpu_reduce_min(a::AbstractArray, dims::Integer) = minimum(a, dims=dims) -cpu_reduce_and(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x & y, a, dims=dims) -cpu_reduce_or(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x | y, a, dims=dims) -cpu_reduce_xor(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x ⊻ y, a, dims=dims) +cpu_reduce_and(a::AbstractArray{<:Unsigned}, dims::Integer) = reduce((x, y) -> x & y, a, init=typemax(eltype(a)), dims=dims) +cpu_reduce_and(a::AbstractArray{<:Signed}, dims::Integer) = reduce((x, y) -> x & y, a, init=Int64(-1), dims=dims) 
+cpu_reduce_or(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x | y, a, init=zero(eltype(a)), dims=dims) +cpu_reduce_xor(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x ⊻ y, a, init=zero(eltype(a)), dims=dims) #======================================================================# # Float32 operations From b256403a7a2b333055f9c17b7eeebf56dd8cab6e Mon Sep 17 00:00:00 2001 From: Arhik Date: Mon, 12 Jan 2026 03:26:30 +0530 Subject: [PATCH 21/31] bytecode: fix zigzag encoding for signed varint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original code tried to convert Int64 directly to UInt64, which fails for negative values like typemin(Int32) = -2147483648. Zigzag encoding maps: (n << 1) ⊻ (n >> 63), enabling proper encoding of negative integers in varint format. --- src/bytecode/writer.jl | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl index 33ff20a..fbe5d83 100644 --- a/src/bytecode/writer.jl +++ b/src/bytecode/writer.jl @@ -327,8 +327,15 @@ end Encode a signed integer as a variable-length integer. Uses zigzag encoding for signed values. 
""" -function encode_signed_varint!(buf::Vector{UInt8}, value::Union{UInt16, UInt32, UInt64, Int64}) - # For float bits, encode as unsigned varint +function encode_signed_varint!(buf::Vector{UInt8}, value::Int64) + # Zigzag encoding: (n << 1) ⊻ (n >> 63) + # This maps negative values to positive odd numbers + encoded = (UInt64(value) << 1) ⊻ (UInt64(value) >> 63) + encode_varint!(buf, encoded) +end + +function encode_signed_varint!(buf::Vector{UInt8}, value::Union{UInt16, UInt32, UInt64}) + # For unsigned types, just encode as-is encode_varint!(buf, UInt64(value)) end From a50055f07835c8ea65ab35be6fb422c86d871d1e Mon Sep 17 00:00:00 2001 From: Arhik Date: Mon, 12 Jan 2026 04:26:09 +0530 Subject: [PATCH 22/31] reverting zigzag encoding --- src/bytecode/writer.jl | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl index fbe5d83..8259116 100644 --- a/src/bytecode/writer.jl +++ b/src/bytecode/writer.jl @@ -327,14 +327,8 @@ end Encode a signed integer as a variable-length integer. Uses zigzag encoding for signed values. 
""" -function encode_signed_varint!(buf::Vector{UInt8}, value::Int64) - # Zigzag encoding: (n << 1) ⊻ (n >> 63) - # This maps negative values to positive odd numbers - encoded = (UInt64(value) << 1) ⊻ (UInt64(value) >> 63) - encode_varint!(buf, encoded) -end -function encode_signed_varint!(buf::Vector{UInt8}, value::Union{UInt16, UInt32, UInt64}) +function encode_signed_varint!(buf::Vector{UInt8}, value::Union{UInt16, UInt32, UInt64, Int64}) # For unsigned types, just encode as-is encode_varint!(buf, UInt64(value)) end From 7ba27c430308fc46653bb0cf2829630671fefe93 Mon Sep 17 00:00:00 2001 From: Arhik Date: Mon, 12 Jan 2026 04:56:01 +0530 Subject: [PATCH 23/31] bytecode: fix zigzag encoding for signed varint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Zigzag encoding: (n << 1) ⊻ (n >> 63) properly handles negative values like typemin(Int32) = -2147483648. Unsigned values use plain varint encoding since they don't need zigzag. --- src/bytecode/writer.jl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl index 8259116..fbe5d83 100644 --- a/src/bytecode/writer.jl +++ b/src/bytecode/writer.jl @@ -327,8 +327,14 @@ end Encode a signed integer as a variable-length integer. Uses zigzag encoding for signed values. 
""" +function encode_signed_varint!(buf::Vector{UInt8}, value::Int64) + # Zigzag encoding: (n << 1) ⊻ (n >> 63) + # This maps negative values to positive odd numbers + encoded = (UInt64(value) << 1) ⊻ (UInt64(value) >> 63) + encode_varint!(buf, encoded) +end -function encode_signed_varint!(buf::Vector{UInt8}, value::Union{UInt16, UInt32, UInt64, Int64}) +function encode_signed_varint!(buf::Vector{UInt8}, value::Union{UInt16, UInt32, UInt64}) # For unsigned types, just encode as-is encode_varint!(buf, UInt64(value)) end From 95555c6fdfe865462ff3312a8cdd4d2ba9ad9569 Mon Sep 17 00:00:00 2001 From: Arhik Date: Mon, 12 Jan 2026 04:58:05 +0530 Subject: [PATCH 24/31] bytecode: remove duplicate encode_signed_varint! The correct implementation is in src/bytecode/basic.jl: function encode_signed_varint!(buf, x) x = x << 1 if x < 0 x = ~x end encode_varint!(buf, x) end The duplicate in writer.jl was shadowing the correct one. --- src/bytecode/writer.jl | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl index fbe5d83..88bd190 100644 --- a/src/bytecode/writer.jl +++ b/src/bytecode/writer.jl @@ -324,21 +324,6 @@ end """ encode_signed_varint!(buf, value) -Encode a signed integer as a variable-length integer. -Uses zigzag encoding for signed values. -""" -function encode_signed_varint!(buf::Vector{UInt8}, value::Int64) - # Zigzag encoding: (n << 1) ⊻ (n >> 63) - # This maps negative values to positive odd numbers - encoded = (UInt64(value) << 1) ⊻ (UInt64(value) >> 63) - encode_varint!(buf, encoded) -end - -function encode_signed_varint!(buf::Vector{UInt8}, value::Union{UInt16, UInt32, UInt64}) - # For unsigned types, just encode as-is - encode_varint!(buf, UInt64(value)) -end - """ encode_identity_array!(cb, identities) From 55efe4be2bcbe14af372256f910b8e2f76478a25 Mon Sep 17 00:00:00 2001 From: Arhik Date: Mon, 12 Jan 2026 05:26:29 +0530 Subject: [PATCH 25/31] intrinsics: pass signedness to encode_MinIOp! 
and encode_MaxIOp! For unsigned integer types like UInt16, UInt32, the comparison must use unsigned signedness, not the default SignednessSigned. This fixes wrong reduction results for unsigned types where signed comparison was causing values to be interpreted incorrectly (e.g., 0xFFFF interpreted as -1). --- src/compiler/intrinsics/core.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index 1991150..9ad7782 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -733,11 +733,11 @@ encode_reduce_body(cb, type, acc, elem, ::Val{:min}, ::Type{T}) where T <: Abstr encode_reduce_body(cb, type, acc, elem, ::Val{:add}, ::Type{T}) where T <: Integer = encode_AddIOp!(cb, type, acc, elem) encode_reduce_body(cb, type, acc, elem, ::Val{:max}, ::Type{T}) where T <: Integer = - encode_MaxIOp!(cb, type, acc, elem) + encode_MaxIOp!(cb, type, acc, elem; signedness=is_signed(T) ? SignednessSigned : SignednessUnsigned) encode_reduce_body(cb, type, acc, elem, ::Val{:mul}, ::Type{T}) where T <: Integer = encode_MulIOp!(cb, type, acc, elem) encode_reduce_body(cb, type, acc, elem, ::Val{:min}, ::Type{T}) where T <: Integer = - encode_MinIOp!(cb, type, acc, elem) + encode_MinIOp!(cb, type, acc, elem; signedness=is_signed(T) ? SignednessSigned : SignednessUnsigned) # less likely commutative/associative ops can be reduced too for whatever reason. 
From 77118eb579585baa1449e6571f171b208da224af Mon Sep 17 00:00:00 2001 From: Arhik Date: Mon, 12 Jan 2026 05:30:49 +0530 Subject: [PATCH 26/31] reverting original encode_signed_varint --- examples/reducekernel.jl | 20 ++++++++++++++++++++ src/bytecode/writer.jl | 9 +++++++++ 2 files changed, 29 insertions(+) create mode 100644 examples/reducekernel.jl diff --git a/examples/reducekernel.jl b/examples/reducekernel.jl new file mode 100644 index 0000000..7bb485b --- /dev/null +++ b/examples/reducekernel.jl @@ -0,0 +1,20 @@ +using Test +using CUDA +using cuTile +import cuTile as ct + +elType = UInt16 +function reduceKernel(a::ct.TileArray{elType,1}, b::ct.TileArray{elType,1}, tileSz::ct.Constant{Int}) + bid = ct.bid(1) + tile = ct.load(a, bid, (tileSz[],)) + result = ct.reduce_min(tile, Val(1)) + ct.store(b, bid, result) + return nothing +end + +sz = 32 +N = 2^15 +a = CUDA.rand(elType, N) +b = CUDA.zeros(elType, cld(N, sz)) +CUDA.@sync ct.launch(reduceKernel, cld(length(a), sz), a, b, ct.Constant(sz)) +res = Array(b) diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl index 88bd190..8259116 100644 --- a/src/bytecode/writer.jl +++ b/src/bytecode/writer.jl @@ -324,6 +324,15 @@ end """ encode_signed_varint!(buf, value) +Encode a signed integer as a variable-length integer. +Uses zigzag encoding for signed values. 
+""" + +function encode_signed_varint!(buf::Vector{UInt8}, value::Union{UInt16, UInt32, UInt64, Int64}) + # For unsigned types, just encode as-is + encode_varint!(buf, UInt64(value)) +end + """ encode_identity_array!(cb, identities) From dfffe9886cf7083d4416dd5f0d6b19efdbcb06be Mon Sep 17 00:00:00 2001 From: Arhik Date: Mon, 12 Jan 2026 05:32:05 +0530 Subject: [PATCH 27/31] revert comment inside encode_signed_varint --- src/bytecode/writer.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl index 8259116..bdb53f7 100644 --- a/src/bytecode/writer.jl +++ b/src/bytecode/writer.jl @@ -329,7 +329,7 @@ Uses zigzag encoding for signed values. """ function encode_signed_varint!(buf::Vector{UInt8}, value::Union{UInt16, UInt32, UInt64, Int64}) - # For unsigned types, just encode as-is + # For float bits, encode as unsigned varint encode_varint!(buf, UInt64(value)) end From eeb4d243f96777d5fed7708963f9aa67f464b0e1 Mon Sep 17 00:00:00 2001 From: Arhik Date: Sun, 11 Jan 2026 20:34:09 +0530 Subject: [PATCH 28/31] scanops related changes --- src/bytecode/encodings.jl | 72 +++++++++++++++++++++++++ src/bytecode/writer.jl | 88 ++++++++++++++++++++++++++++++ src/compiler/intrinsics/core.jl | 95 +++++++++++++++++++++++++++++++++ src/language/operations.jl | 21 +++++++- 4 files changed, 275 insertions(+), 1 deletion(-) diff --git a/src/bytecode/encodings.jl b/src/bytecode/encodings.jl index 1c06dbb..650a5ed 100644 --- a/src/bytecode/encodings.jl +++ b/src/bytecode/encodings.jl @@ -1331,6 +1331,78 @@ function encode_ReduceOp!(body::Function, cb::CodeBuilder, end end + +#============================================================================= + Scan operations +=============================================================================# + +""" + encode_ScanOp!(body::Function, cb::CodeBuilder, + result_types::Vector{TypeId}, + operands::Vector{Value}, + dim::Int, + reverse::Bool, + identities::Vector{<:ScanIdentity}, 
+ body_scalar_types::Vector{TypeId}) + +Encode a ScanOp (parallel prefix sum) operation. + +# Arguments +- body: Function that takes block args and yields result(s) +- cb: CodeBuilder for the bytecode +- result_types: Output tile types +- operands: Input tiles to scan +- dim: Dimension to scan along (0-indexed) +- reverse: Whether to scan in reverse order +- identities: Identity values for each operand +- body_scalar_types: 0D tile types for body arguments +""" +function encode_ScanOp!(body::Function, cb::CodeBuilder, + result_types::Vector{TypeId}, + operands::Vector{Value}, + dim::Int, + reverse::Bool, + identities::Vector{<:ScanIdentity}, + body_scalar_types::Vector{TypeId}) + encode_varint!(cb.buf, Opcode.ScanOp) + + # Variadic result types + encode_typeid_seq!(cb.buf, result_types) + + # Attributes: dim (int), reverse (bool), identities (array) + encode_opattr_int!(cb, dim) + encode_opattr_bool!(cb, reverse) + encode_scan_identity_array!(cb, identities) + + # Variadic operands + encode_varint!(cb.buf, length(operands)) + encode_operands!(cb.buf, operands) + + # Number of regions + push!(cb.debug_attrs, cb.cur_debug_attr) + cb.num_ops += 1 + encode_varint!(cb.buf, 1) # 1 region: body + + # Body region - block args are pairs of (acc, elem) for each operand + # The body operates on 0D tiles (scalars) + body_arg_types = TypeId[] + for scalar_type in body_scalar_types + push!(body_arg_types, scalar_type) # accumulator + push!(body_arg_types, scalar_type) # element + end + with_region(body, cb, body_arg_types) + + # Create result values + num_results = length(result_types) + if num_results == 0 + return Value[] + else + vals = [Value(cb.next_value_id + i) for i in 0:num_results-1] + cb.next_value_id += num_results + return vals + end +end + #============================================================================= Comparison and selection operations =============================================================================# diff --git a/src/bytecode/writer.jl 
#=============================================================================
 Tagged attributes for ScanOp identity values
=============================================================================#

"""
    ScanIdentity

Abstract supertype for scan identity attributes.
"""
abstract type ScanIdentity end

"""
    ScanFloatIdentity(value, type_id, dtype)

Floating-point identity value for a scan operand. `dtype` records the
concrete Julia float type (Float16, Float32, Float64, ...) used when
converting `value` to its bit pattern.
"""
struct ScanFloatIdentity <: ScanIdentity
    value::Float64
    type_id::TypeId
    dtype::Type
end

"""
    ScanIntegerIdentity(value, type_id, dtype)

Integer identity value for a scan operand. `dtype` is the concrete Julia
integer type; `signed` selects signed vs. unsigned varint encoding of
`value`.
"""
struct ScanIntegerIdentity <: ScanIdentity
    value::Int64
    type_id::TypeId
    dtype::Type
    signed::Bool
end

"""
    encode_tagged_scan_identity!(cb, identity)

Encode one scan identity attribute, branching on its concrete type.
"""
function encode_tagged_scan_identity!(cb::CodeBuilder, identity::ScanIdentity)
    if identity isa ScanFloatIdentity
        encode_tagged_scan_float!(cb, identity)
    elseif identity isa ScanIntegerIdentity
        encode_tagged_scan_integer!(cb, identity)
    else
        error("Unsupported scan identity type: $(typeof(identity))")
    end
end

"""
    encode_tagged_scan_float!(cb, identity)

Encode a tagged float attribute for a scan identity:
tag byte 0x02, then the type id, then the value's bit pattern as a varint.
"""
function encode_tagged_scan_float!(cb::CodeBuilder, identity::ScanFloatIdentity)
    push!(cb.buf, 0x02)  # tag: Float attribute
    encode_typeid!(cb.buf, identity.type_id)
    encode_signed_varint!(cb.buf, float_to_bits(identity.value, identity.dtype))
end

"""
    encode_tagged_scan_integer!(cb, identity)

Encode a tagged integer attribute for a scan identity:
tag byte 0x01, then the type id, then the value as a signed or unsigned
varint according to `identity.signed`.
"""
function encode_tagged_scan_integer!(cb::CodeBuilder, identity::ScanIntegerIdentity)
    push!(cb.buf, 0x01)  # tag: Integer attribute
    encode_typeid!(cb.buf, identity.type_id)
    if identity.signed
        encode_signed_varint!(cb.buf, identity.value)
    else
        # NOTE(review): throws for negative `value` — callers must pass a
        # non-negative value when `signed == false`.
        encode_varint!(cb.buf, UInt64(identity.value))
    end
end

"""
    encode_scan_identity_array!(cb, identities)

Encode a length-prefixed array of scan identity attributes.
"""
function encode_scan_identity_array!(cb::CodeBuilder, identities::Vector{<:ScanIdentity})
    encode_varint!(cb.buf, length(identities))
    foreach(id -> encode_tagged_scan_identity!(cb, id), identities)
end
+ """ + @noinline function scan(tile::Tile{T, S}, ::Val{axis}, fn::Symbol, reverse::Bool=false) where {T, S, axis} + # Scan preserves shape - result has same dimensions as input + Tile{T, S}() + end +end + +function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.scan), args) + cb = ctx.cb + tt = ctx.tt + + # Get input tile + input_tv = emit_value!(ctx, args[1]) + input_tv === nothing && error("Cannot resolve input tile for scan") + + # Get scan axis + axis = @something get_constant(ctx, args[2]) error("Scan axis must be a compile-time constant") + + # Get scan function type + fn_type = @something get_constant(ctx, args[3]) error("Scan function type must be a compile-time constant") + fn_type == :add || fn_type == :mul || error("Scan function must be :add or :mul") + + # Get reverse flag (optional, defaults to false) + reverse = false + if length(args) >= 4 + reverse_val = get_constant(ctx, args[4]) + reverse = reverse_val === true + end + + # Get element type and shapes + input_type = unwrap_type(input_tv.jltype) + elem_type = input_type <: Tile ? 
input_type.parameters[1] : input_type + input_shape = input_tv.shape + + # For scan, output shape is same as input shape + output_shape = copy(input_shape) + + dtype = julia_to_tile_dtype!(tt, elem_type) + + # Output tile type (same shape as input) + output_tile_type = tile_type!(tt, dtype, output_shape) + + # Scalar type for scan body (0D tile) + scalar_tile_type = tile_type!(tt, dtype, Int[]) + + # Create identity value + # For cumsum: identity is 0.0 (represented as -0.0 for float) + # For cumprod: identity is 1.0 + if fn_type == :add + identity_val = -0.0 # Negative zero works as additive identity + else # :mul + identity_val = 1.0 + end + + # Choose identity type based on element type + if elem_type <: AbstractFloat + # Use float identity for float types + identity = ScanFloatIdentity(identity_val, dtype, elem_type) + elseif elem_type <: Integer + # Use integer identity for integer types + identity_val_int = fn_type == :add ? Int64(0) : Int64(1) + is_signed = elem_type <: Signed + identity = ScanIntegerIdentity(identity_val_int, dtype, elem_type, is_signed) + else + error("Unsupported element type for scan: $elem_type") + end + + # Emit ScanOp + results = encode_ScanOp!(cb, [output_tile_type], [input_tv.v], axis, reverse, [identity], [scalar_tile_type]) do block_args + acc, elem = block_args[1], block_args[2] + res = encode_scan_body(cb, scalar_tile_type, acc, elem, Val(fn_type), elem_type) + encode_YieldOp!(cb, [res]) + end + + CGVal(results[1], output_tile_type, Tile{elem_type, Tuple(output_shape)}, output_shape) +end + +# Dispatch helpers for scan body operations - dispatch on Val{fn} and elem_type +encode_scan_body(cb, type, acc, elem, ::Val{:add}, ::Type{T}) where T <: AbstractFloat = + encode_AddFOp!(cb, type, acc, elem) +encode_scan_body(cb, type, acc, elem, ::Val{:add}, ::Type{T}) where T <: Integer = + encode_AddIOp!(cb, type, acc, elem) +encode_scan_body(cb, type, acc, elem, ::Val{:mul}, ::Type{T}) where T <: AbstractFloat = + encode_MulFOp!(cb, 
type, acc, elem) +encode_scan_body(cb, type, acc, elem, ::Val{:mul}, ::Type{T}) where T <: Integer = + encode_MulIOp!(cb, type, acc, elem) + # cuda_tile.select @eval Intrinsics begin diff --git a/src/language/operations.jl b/src/language/operations.jl index 075ddb8..846a1f7 100644 --- a/src/language/operations.jl +++ b/src/language/operations.jl @@ -532,6 +532,7 @@ end Intrinsics.reduce_max(tile, Val(axis - 1)) end +<<<<<<< HEAD """ reduce_mul(tile::Tile{T, S}, axis::Integer) -> Tile{T, reduced_shape} @@ -622,7 +623,25 @@ result = ct.reduce_xor(tile, 2) # Returns (128,) tile of Int32 Intrinsics.reduce_xor(tile, Val(axis - 1)) end @inline function reduce_xor(tile::Tile{T, S}, ::Val{axis}) where {T <: Integer, S, axis} - Intrinsics.reduce_xor(tile, Val(axis - 1)) +Intrinsics.reduce_xor(tile, Val(axis - 1)) +end + +# Scan (Prefix Sum) Operations + +@inline function scan(tile::Tile{T, S}, ::Val{axis}, + fn::Symbol=:add, + reverse::Bool=false) where {T, S, axis} + Intrinsics.scan(tile, Val(axis), fn, reverse) +end + +@inline function cumsum(tile::Tile{T, S}, ::Val{axis}, + reverse::Bool=false) where {T, S, axis} + scan(tile, Val(axis), :add, reverse) +end + +@inline function cumprod(tile::Tile{T, S}, ::Val{axis}, + reverse::Bool=false) where {T, S, axis} + scan(tile, Val(axis), :mul, reverse) end #============================================================================= From 6773baf4aeda0e36044b0e98378eb9741039c592 Mon Sep 17 00:00:00 2001 From: Arhik Date: Mon, 12 Jan 2026 12:23:43 +0530 Subject: [PATCH 29/31] consolidating common identity ops with scanops branch - We use same IdentityOp abstraction - scan uses the same encodings as reduce op since they mathematically same ops. - left room for dispatch, lets say scan, to take custom path if needed. 
--- src/bytecode/encodings.jl | 6 +-- src/bytecode/writer.jl | 94 ++------------------------------- src/compiler/intrinsics/core.jl | 4 +- 3 files changed, 8 insertions(+), 96 deletions(-) diff --git a/src/bytecode/encodings.jl b/src/bytecode/encodings.jl index 650a5ed..a301257 100644 --- a/src/bytecode/encodings.jl +++ b/src/bytecode/encodings.jl @@ -1342,7 +1342,7 @@ end operands::Vector{Value}, dim::Int, reverse::Bool, - identities::Vector{<:ScanIdentity}, + identities::Vector{<:IdentityOp}, body_scalar_types::Vector{TypeId}) Encode a ScanOp (parallel prefix sum) operation. @@ -1362,7 +1362,7 @@ function encode_ScanOp!(body::Function, cb::CodeBuilder, operands::Vector{Value}, dim::Int, reverse::Bool, - identities::Vector{<:ScanIdentity}, + identities::Vector{<:IdentityOp}, body_scalar_types::Vector{TypeId}) encode_varint!(cb.buf, Opcode.ScanOp) @@ -1372,7 +1372,7 @@ function encode_ScanOp!(body::Function, cb::CodeBuilder, # Attributes: dim (int), reverse (bool), identities (array) encode_opattr_int!(cb, dim) encode_opattr_bool!(cb, reverse) - encode_scan_identity_array!(cb, identities) + encode_identity_array!(cb, identities) # Variadic operands encode_varint!(cb.buf, length(operands)) diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl index 8f5b89b..cf37cbc 100644 --- a/src/bytecode/writer.jl +++ b/src/bytecode/writer.jl @@ -280,12 +280,12 @@ function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentityOp) end """ - encode_tagged_int!(cb, identity::IntegerIdentityOp) + encode_tagged_integer!(cb, identity::IntegerIdentityOp) Encode a tagged integer identity attribute. 
Format: tag(Int=0x01) + typeid + ap_int(value) """ -function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentityOp) +function encode_tagged_integer!(cb::CodeBuilder, identity::IntegerIdentityOp) # Tag for Int attribute push!(cb.buf, 0x01) # Type ID @@ -346,101 +346,13 @@ function encode_identity_array!(cb::CodeBuilder, identities::Vector{<:IdentityOp end end -#============================================================================= - Tagged attributes for ScanOp identity values -=============================================================================# - -""" - ScanIdentity - -Abstract type for scan identity attributes. -""" -abstract type ScanIdentity end - -""" - ScanFloatIdentity(value, type_id, dtype) - -Float identity value for scan operations. -""" -struct ScanFloatIdentity <: ScanIdentity - value::Float64 - type_id::TypeId - dtype::Type # Float16, Float32, Float64, etc. -end - -""" - ScanIntegerIdentity(value, type_id, dtype) - -Integer identity value for scan operations. -""" -struct ScanIntegerIdentity <: ScanIdentity - value::Int64 - type_id::TypeId - dtype::Type # Int8, Int16, Int32, Int64 - signed::Bool # true for signed, false for unsigned -end - -""" - encode_tagged_scan_identity!(cb, identity::ScanIdentity) - -Encode a scan identity attribute. -""" -function encode_tagged_scan_identity!(cb::CodeBuilder, identity::ScanIdentity) - if identity isa ScanFloatIdentity - encode_tagged_scan_float!(cb, identity) - elseif identity isa ScanIntegerIdentity - encode_tagged_scan_integer!(cb, identity) - else - error("Unsupported scan identity type: $(typeof(identity))") - end -end - -""" - encode_tagged_scan_float!(cb, identity::ScanFloatIdentity) - -Encode a tagged float attribute for scan identity. 
-""" -function encode_tagged_scan_float!(cb::CodeBuilder, identity::ScanFloatIdentity) - push!(cb.buf, 0x02) # Tag for Float attribute - encode_typeid!(cb.buf, identity.type_id) - bits = float_to_bits(identity.value, identity.dtype) - encode_signed_varint!(cb.buf, bits) -end - -""" - encode_tagged_scan_integer!(cb, identity::ScanIntegerIdentity) - -Encode a tagged integer attribute for scan identity. -""" -function encode_tagged_scan_integer!(cb::CodeBuilder, identity::ScanIntegerIdentity) - push!(cb.buf, 0x01) # Tag for Integer attribute - encode_typeid!(cb.buf, identity.type_id) - if identity.signed - encode_signed_varint!(cb.buf, identity.value) - else - encode_varint!(cb.buf, UInt64(identity.value)) - end -end - -""" - encode_scan_identity_array!(cb, identities) - -Encode an array of scan identity attributes. -""" -function encode_scan_identity_array!(cb::CodeBuilder, identities::Vector{<:ScanIdentity}) - encode_varint!(cb.buf, length(identities)) - for identity in identities - encode_tagged_scan_identity!(cb, identity) - end -end - """ encode_identity!(cb, identity) Encode a single identity attribute, dispatching on type. 
""" encode_identity!(cb::CodeBuilder, identity::FloatIdentityOp) = encode_tagged_float!(cb, identity) -encode_identity!(cb::CodeBuilder, identity::IntegerIdentityOp) = encode_tagged_int!(cb, identity) +encode_identity!(cb::CodeBuilder, identity::IntegerIdentityOp) = encode_tagged_integer!(cb, identity) """ BytecodeWriter diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index 3d1f6a0..3703a7e 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -884,12 +884,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.scan), args) # Choose identity type based on element type if elem_type <: AbstractFloat # Use float identity for float types - identity = ScanFloatIdentity(identity_val, dtype, elem_type) + identity = FloatIdentityOp(identity_val, dtype, elem_type) elseif elem_type <: Integer # Use integer identity for integer types identity_val_int = fn_type == :add ? Int64(0) : Int64(1) is_signed = elem_type <: Signed - identity = ScanIntegerIdentity(identity_val_int, dtype, elem_type, is_signed) + identity = IntegerIdentityOp(identity_val_int, dtype, elem_type, is_signed) else error("Unsupported element type for scan: $elem_type") end From 9b04ce720de2a035df33db0f9f2fd4aadc1c97db Mon Sep 17 00:00:00 2001 From: Arhik Date: Mon, 12 Jan 2026 15:34:54 +0530 Subject: [PATCH 30/31] scan: add min/max support for float and integer types - Added encode_scan_body methods for :min and :max operations - Float types use MinFOp/MaxFOp (no signedness needed) - Integer types use MinIOp/MaxIOp with signedness parameter - Aligns scan implementation with existing reduce pattern --- src/compiler/intrinsics/core.jl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index 3703a7e..71e6f65 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -913,6 +913,14 @@ encode_scan_body(cb, type, acc, elem, ::Val{:mul}, ::Type{T}) 
# Scan combiner bodies for :min/:max. Floats use MinFOp/MaxFOp (no
# signedness needed); integers pass an explicit signedness, chosen the same
# way emit_intrinsic! chooses it (`T <: Signed`) rather than via an
# undefined `is_signed` helper.
encode_scan_body(cb, type, acc, elem, ::Val{:min}, ::Type{T}) where {T <: AbstractFloat} =
    encode_MinFOp!(cb, type, acc, elem)
encode_scan_body(cb, type, acc, elem, ::Val{:max}, ::Type{T}) where {T <: AbstractFloat} =
    encode_MaxFOp!(cb, type, acc, elem)
encode_scan_body(cb, type, acc, elem, ::Val{:min}, ::Type{T}) where {T <: Integer} =
    encode_MinIOp!(cb, type, acc, elem;
                   signedness = T <: Signed ? SignednessSigned : SignednessUnsigned)
encode_scan_body(cb, type, acc, elem, ::Val{:max}, ::Type{T}) where {T <: Integer} =
    encode_MaxIOp!(cb, type, acc, elem;
                   signedness = T <: Signed ? SignednessSigned : SignednessUnsigned)
# NOTE(review): the scan emit_intrinsic! still restricts fn to :add/:mul,
# so these :min/:max methods are unreachable until identity handling for
# them (Inf/-Inf, typemax/typemin) is added there.

# --- examples/scanKernel.jl ---
using Test
using CUDA
using cuTile
import cuTile as ct

"""
    cumsum_1d_kernel(a, b, tile_size)

Intra-tile cumulative sum: each block loads one tile of `a`, scans it along
axis 1, and stores the per-tile result into `b`. (No cross-tile carry.)
"""
function cumsum_1d_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1},
                          tile_size::ct.Constant{Int})
    bid = ct.bid(1)
    tile = ct.load(a, bid, (tile_size[],))
    result = ct.cumsum(tile, ct.axis(1))
    ct.store(b, bid, result)
    return nothing
end

sz = 32
N = 2^15
a = CUDA.rand(Float32, N)
b = CUDA.zeros(Float32, N)
CUDA.@sync ct.launch(cumsum_1d_kernel, cld(length(a), sz), a, b, ct.Constant(sz))

# This is a simplified multi-pass variant rather than a true single-pass
# (memory-ordering / decoupled-lookback) kernel; the point here is to
# demonstrate the scan operation.
# CSDL phase 1: intra-tile scan, plus one running total per tile.
function cumsum_csdl_phase1(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1},
                            tile_sums::ct.TileArray{Float32,1},
                            tile_size::ct.Constant{Int})
    bid = ct.bid(1)
    # Scan this block's tile and store the partial (per-tile) result.
    local_tile = ct.load(a, bid, (tile_size[],))
    scanned = ct.cumsum(local_tile, ct.axis(1))
    ct.store(b, bid, scanned)
    # The last element of the inclusive scan is this tile's total.
    ct.store(tile_sums, bid, ct.extract(scanned, (tile_size[],), (1,)))
    return
end

# CSDL phase 2: add the totals of preceding tiles to this tile's partial scan.
# NOTE(review): the loop reads tile_sums at indices bid down to 2 — confirm
# against ct.load's tile-indexing convention that this sums exactly the
# predecessors of tile `bid` and not the tile's own total.
function cumsum_csdl_phase2(b::ct.TileArray{Float32,1},
                            tile_sums::ct.TileArray{Float32,1},
                            tile_size::ct.Constant{Int})
    bid = ct.bid(1)
    carry = ct.zeros((tile_size[],), Float32)
    idx = Int32(bid)
    while idx > 1
        carry = carry .+ ct.load(tile_sums, (idx,), (1,))
        idx -= Int32(1)
    end
    ct.store(b, bid, ct.load(b, bid, (tile_size[],)) .+ carry)
    return nothing
end

# Driver: run both phases, then check against a CPU reference cumsum.
n = length(a)
num_tiles = cld(n, sz)
tile_sums = CUDA.zeros(Float32, num_tiles)
CUDA.@sync ct.launch(cumsum_csdl_phase1, num_tiles, a, b, tile_sums, ct.Constant(sz))
CUDA.@sync ct.launch(cumsum_csdl_phase2, num_tiles, b, tile_sums, ct.Constant(sz))

reference = cumsum(collect(a), dims=1)
@test isapprox(collect(b), reference)