From 3bc8521438dbcb1e04cd95c709a5c479314fbf09 Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Sun, 11 Jan 2026 18:53:32 +0530
Subject: [PATCH 01/29] intrinsics: extend reduce operations with mul, min,
 and, or, xor for both float and integer types

- Add reduce_mul, reduce_min, reduce_and, reduce_or, reduce_xor intrinsic functions
- Remove AbstractFloat constraint from reduce_sum and reduce_max
- Add corresponding emit_intrinsic! handlers for new operations
- Add integer encode_reduce_body methods for and, or, xor operations

Summary of Additions

| Function | Symbol | Types |
|----------|--------|-------|
| `reduce_sum` | `:add` | Any |
| `reduce_max` | `:max` | Any |
| `reduce_mul` | `:mul` | Any |
| `reduce_min` | `:min` | Any |
| `reduce_and` | `:and` | Integer only |
| `reduce_or` | `:or` | Integer only |
| `reduce_xor` | `:xor` | Integer only |
---
 src/compiler/intrinsics/core.jl | 114 +++++++++++++++++++++++++++++---
 1 file changed, 104 insertions(+), 10 deletions(-)

diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl
index 7fb2530..f43504d 100644
--- a/src/compiler/intrinsics/core.jl
+++ b/src/compiler/intrinsics/core.jl
@@ -512,7 +512,7 @@ end
     Sum reduction along 0-indexed axis.
     Compiled to cuda_tile.reduce with ADD.
     """
-    @noinline function reduce_sum(tile::Tile{T, S}, ::Val{axis}) where {T <: AbstractFloat, S, axis}
+    @noinline function reduce_sum(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis}
         reduced_shape = ntuple(i -> S[i < axis + 1 ? i : i + 1], length(S) - 1)
         Tile{T, reduced_shape}()
     end
@@ -523,7 +523,65 @@ end
     Maximum reduction along 0-indexed axis.
     Compiled to cuda_tile.reduce with MAX.
     """
-    @noinline function reduce_max(tile::Tile{T, S}, ::Val{axis}) where {T <: AbstractFloat, S, axis}
+    @noinline function reduce_max(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis}
+        reduced_shape = ntuple(i -> S[i < axis + 1 ? i : i + 1], length(S) - 1)
+        Tile{T, reduced_shape}()
+    end
+
+    """
+        reduce_mul(tile, axis_val)
+
+    Product reduction along 0-indexed axis.
+    Compiled to cuda_tile.reduce with MUL.
+    """
+    @noinline function reduce_mul(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis}
+        reduced_shape = ntuple(i -> S[i < axis + 1 ? i : i + 1], length(S) - 1)
+        Tile{T, reduced_shape}()
+    end
+
+    """
+        reduce_min(tile, axis_val)
+
+    Minimum reduction along 0-indexed axis.
+    Compiled to cuda_tile.reduce with MIN.
+    """
+    @noinline function reduce_min(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis}
+        reduced_shape = ntuple(i -> S[i < axis + 1 ? i : i + 1], length(S) - 1)
+        Tile{T, reduced_shape}()
+    end
+
+    """
+        reduce_and(tile, axis_val)
+
+    Bitwise AND reduction along 0-indexed axis.
+    Compiled to cuda_tile.reduce with AND.
+    Integer types only.
+    """
+    @noinline function reduce_and(tile::Tile{T, S}, ::Val{axis}) where {T <: Integer, S, axis}
+        reduced_shape = ntuple(i -> S[i < axis + 1 ? i : i + 1], length(S) - 1)
+        Tile{T, reduced_shape}()
+    end
+
+    """
+        reduce_or(tile, axis_val)
+
+    Bitwise OR reduction along 0-indexed axis.
+    Compiled to cuda_tile.reduce with OR.
+    Integer types only.
+    """
+    @noinline function reduce_or(tile::Tile{T, S}, ::Val{axis}) where {T <: Integer, S, axis}
+        reduced_shape = ntuple(i -> S[i < axis + 1 ? i : i + 1], length(S) - 1)
+        Tile{T, reduced_shape}()
+    end
+
+    """
+        reduce_xor(tile, axis_val)
+
+    Bitwise XOR reduction along 0-indexed axis.
+    Compiled to cuda_tile.reduce with XOR.
+    Integer types only.
+    """
+    @noinline function reduce_xor(tile::Tile{T, S}, ::Val{axis}) where {T <: Integer, S, axis}
         reduced_shape = ntuple(i -> S[i < axis + 1 ? i : i + 1], length(S) - 1)
         Tile{T, reduced_shape}()
     end
@@ -534,6 +592,22 @@ end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reduce_max), args)
     emit_reduce!(ctx, args, :max)
 end
+function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reduce_mul), args)
+    emit_reduce!(ctx, args, :mul)
+end
+function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reduce_min), args)
+    emit_reduce!(ctx, args, :min)
+end
+function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reduce_and), args)
+    emit_reduce!(ctx, args, :and)
+end
+function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reduce_or), args)
+    emit_reduce!(ctx, args, :or)
+end
+function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reduce_xor), args)
+    emit_reduce!(ctx, args, :xor)
+end
+
 function emit_reduce!(ctx::CGCtx, args, reduce_fn::Symbol)
     cb = ctx.cb
     tt = ctx.tt
@@ -570,20 +644,40 @@ function emit_reduce!(ctx::CGCtx, args, reduce_fn::Symbol)
     results = encode_ReduceOp!(cb, [output_tile_type], [input_tv.v], axis, [identity], [scalar_tile_type]) do block_args
         acc, elem = block_args[1], block_args[2]
 
-        if reduce_fn == :add
-            res = encode_AddFOp!(cb, scalar_tile_type, acc, elem)
-        elseif reduce_fn == :max
-            res = encode_MaxFOp!(cb, scalar_tile_type, acc, elem)
-        else
-            error("Unsupported reduction function: $reduce_fn")
-        end
-
+        res = encode_reduce_body(cb, scalar_tile_type, acc, elem, Val(reduce_fn), elem_type)
         encode_YieldOp!(cb, [res])
     end
 
     CGVal(results[1], output_tile_type, Tile{elem_type, Tuple(output_shape)}, output_shape)
 end
 
+# Dispatch helpers for reduce body operations - dispatch on Val{fn} and elem_type
+encode_reduce_body(cb, type, acc, elem, ::Val{:add}, ::Type{T}) where T <: AbstractFloat =
+    encode_AddFOp!(cb, type, acc, elem)
+encode_reduce_body(cb, type, acc, elem, ::Val{:max}, ::Type{T}) where T <: AbstractFloat =
+    encode_MaxFOp!(cb, type, acc, elem)
+encode_reduce_body(cb, type, acc, elem, ::Val{:mul}, ::Type{T}) where T <: AbstractFloat =
+    encode_MulFOp!(cb, type, acc, elem)
+encode_reduce_body(cb, type, acc, elem, ::Val{:min}, ::Type{T}) where T <: AbstractFloat =
+    encode_MinFOp!(cb, type, acc, elem)
+encode_reduce_body(cb, type, acc, elem, ::Val{:add}, ::Type{T}) where T <: Integer =
+    encode_AddIOp!(cb, type, acc, elem)
+encode_reduce_body(cb, type, acc, elem, ::Val{:max}, ::Type{T}) where T <: Integer =
+    encode_MaxIOp!(cb, type, acc, elem)
+encode_reduce_body(cb, type, acc, elem, ::Val{:mul}, ::Type{T}) where T <: Integer =
+    encode_MulIOp!(cb, type, acc, elem)
+encode_reduce_body(cb, type, acc, elem, ::Val{:min}, ::Type{T}) where T <: Integer =
+    encode_MinIOp!(cb, type, acc, elem)
+
+
+# Bitwise reductions (and, or, xor) are associative and commutative as well, so
+# they reduce the same way; they are only defined for integer element types.
+encode_reduce_body(cb, type, acc, elem, ::Val{:and}, ::Type{T}) where T <: Integer =
+    encode_AndIOp!(cb, type, acc, elem)
+encode_reduce_body(cb, type, acc, elem, ::Val{:or}, ::Type{T}) where T <: Integer =
+    encode_OrIOp!(cb, type, acc, elem)
+encode_reduce_body(cb, type, acc, elem, ::Val{:xor}, ::Type{T}) where T <: Integer =
+    encode_XOrIOp!(cb, type, acc, elem)
 
 # cuda_tile.reshape
 @eval Intrinsics begin

From 2b21f7353b6ef68598f8b59ea373119fc8881b24 Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Fri, 16 Jan 2026 17:00:04 +0530
Subject: [PATCH 02/29] operations: add axis(i) helper for 1-based to 0-based
 axis conversion

- Add axis(i::Integer) -> Val{i-1} convenience function
- Use instead of raw Val for self-documenting axis selection
---
 src/language/operations.jl | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/src/language/operations.jl b/src/language/operations.jl
index 463a358..856029c 100644
--- a/src/language/operations.jl
+++ b/src/language/operations.jl
@@ -59,6 +59,24 @@ Axis is 1-indexed. Equivalent to cld(arr.sizes[axis], shape[axis]).
     Intrinsics.get_index_space_shape(pv, axis - One())  # convert to 0-indexed
 end
 
+"""
+    axis(i::Integer) -> Val{i-1}
+
+Return a compile-time axis selector for tile operations.
+Axis indices are 1-based (axis(1) = first dimension, axis(2) = second, etc.).
+Internally converts to 0-based for Tile IR.
+
+Use this instead of raw `Val` for self-documenting code.
+
+# Examples
+```julia
+ct.cumsum(tile, ct.axis(1))   # Scan along first axis
+ct.cumsum(tile, ct.axis(2))   # Scan along second axis
+ct.scan(tile, ct.axis(1), :add)
+```
+"""
+@inline axis(i::Integer) = Val(i - One())
+
 """
     load(arr::TileArray, index, shape; padding_mode=PaddingMode.Undetermined, latency=nothing, allow_tma=true) -> Tile
 
@@ -628,4 +646,3 @@ br = ct.extract(tile, (2, 2), (4, 4))  # Bottom-right (rows 5-8, cols 5-8)
     Intrinsics.extract(tile, Val(map(i -> i - 1, index)), Val(shape))
 @inline extract(tile::Tile{T}, ::Val{Index}, ::Val{Shape}) where {T, Index, Shape} =
     Intrinsics.extract(tile, Val(map(i -> i - 1, Index)), Val(Shape))
-

From 3f9584fe72ed9bfed19a122e3dd9e7f1e67b1d11 Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Sun, 11 Jan 2026 19:03:44 +0530
Subject: [PATCH 03/29] make axis public

---
 src/language/operations.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/language/operations.jl b/src/language/operations.jl
index 856029c..be9b212 100644
--- a/src/language/operations.jl
+++ b/src/language/operations.jl
@@ -10,7 +10,7 @@
  Load/Store
 =============================================================================#
 
-public bid, num_blocks, num_tiles, load, store, gather, scatter
+public bid, num_blocks, num_tiles, axis, load, store, gather, scatter
 
 """
 Padding mode for load operations.

From 43cd64b524cfd9a58d565099068d9368e936ef04 Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Sun, 11 Jan 2026 19:18:26 +0530
Subject: [PATCH 04/29] make new reduce_{ops} public

---
 src/language/operations.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/language/operations.jl b/src/language/operations.jl
index be9b212..e1de7be 100644
--- a/src/language/operations.jl
+++ b/src/language/operations.jl
@@ -533,7 +533,7 @@ result = ct.astype(acc, ct.TFloat32)  # Convert to TF32 for tensor cores
  Reduction
 =============================================================================#
 
-public reduce_sum, reduce_max
+public reduce_sum, reduce_max, reduce_mul, reduce_min, reduce_and, reduce_or, reduce_xor
 
 """
     reduce_sum(tile::Tile{T, S}, axis::Integer) -> Tile{T, reduced_shape}

From 12ebf3777c9cf61f82f8793a46d2b73f347ee9f2 Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Sun, 11 Jan 2026 19:24:27 +0530
Subject: [PATCH 05/29] reduce ops update and axis convenience

`axis` is a small convenience helper that wraps `Val`.

Note that the reduce wrappers are already one-based, so I am not sure we should keep it. It does no harm, though; it is just a convenience.
---
 README.md | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/README.md b/README.md
index 117267d..140e106 100644
--- a/README.md
+++ b/README.md
@@ -160,7 +160,12 @@ conservative token threading in the compiler (see https://github.com/JuliaGPU/cu
 | Operation | Description |
 |-----------|-------------|
 | `reduce_sum(tile, axis)` | Sum along axis |
+| `reduce_mul(tile, axis)` | Product along axis |
 | `reduce_max(tile, axis)` | Maximum along axis |
+| `reduce_min(tile, axis)` | Minimum along axis |
+| `reduce_and(tile, axis)` | Bitwise AND along axis (integer) |
+| `reduce_or(tile, axis)` | Bitwise OR along axis (integer) |
+| `reduce_xor(tile, axis)` | Bitwise XOR along axis (integer) |
 
 ### Math
 | Operation | Description |
@@ -275,6 +280,11 @@ ct.permute(tile, (3, 1, 2))
 
 This applies to `bid`, `num_blocks`, `permute`, `reshape`, dimension arguments, etc.
 
+### axis convenience
+
+`ct.axis(i)` returns `Val(i - 1)`, the 0-based selector for a 1-based axis `i`; do not pass it to `reduce_*`, which already take 1-based axes.
+
+
 ### `Val`-like constants
 
 CuTile.jl uses `ct.Constant{T}` to encode compile-time constant values in the type domain, similar to how `Val` works. An explicit `[]` is needed to extract the value at runtime:

From d1c977aa713d99bef4921fa389b61fed4320b245 Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Sun, 11 Jan 2026 19:58:00 +0530
Subject: [PATCH 06/29] reduce ops: add wrapper functions and correct identity
 values via dispatch

- Add wrapper functions in operations.jl for reduce_mul, reduce_min, reduce_and,
  reduce_or, reduce_xor with appropriate type constraints
- Refactor identity value selection to use dispatch instead of if-else chain
- Correct identity values:
  - add: 0.0
  - max: -Inf (float) or 0 (int)
  - mul: 1.0
  - min: +Inf (float) or typemax(Int64) (int)
  - and: 0 (interpreted as -1 bits by backend)
  - or: 0
  - xor: 0
---
 src/compiler/intrinsics/core.jl | 57 ++++++++++++++++++--
 src/language/operations.jl      | 93 +++++++++++++++++++++++++++++++++
 2 files changed, 145 insertions(+), 5 deletions(-)

diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl
index f43504d..c71ff08 100644
--- a/src/compiler/intrinsics/core.jl
+++ b/src/compiler/intrinsics/core.jl
@@ -632,13 +632,11 @@ function emit_reduce!(ctx::CGCtx, args, reduce_fn::Symbol)
 
     # Output tile type
     output_tile_type = tile_type!(tt, dtype, output_shape)
-
     # Scalar type for reduction body (0D tile)
     scalar_tile_type = tile_type!(tt, dtype, Int[])
 
-    # Create identity value - use simple dtype (f32), not tile type
-    identity_val = reduce_fn == :add ? -0.0 : (reduce_fn == :max ? -Inf : 0.0)
-    identity = FloatIdentity(identity_val, dtype, elem_type)
+    # Create identity value via dispatch on reduction function and element type
+    identity = reduce_identity(Val(reduce_fn), dtype, elem_type)
 
     # Emit ReduceOp
     results = encode_ReduceOp!(cb, [output_tile_type], [input_tv.v], axis, [identity], [scalar_tile_type]) do block_args
@@ -651,7 +649,56 @@ function emit_reduce!(ctx::CGCtx, args, reduce_fn::Symbol)
     CGVal(results[1], output_tile_type, Tile{elem_type, Tuple(output_shape)}, output_shape)
 end
 
-# Dispatch helpers for reduce body operations - dispatch on Val{fn} and elem_type
+#=============================================================================
+ Reduce Identity Values via Dispatch
+=============================================================================#
+
+"""
+    reduce_identity(reduce_fn, dtype, elem_type) -> ReduceIdentity
+
+Return the identity value for a reduction operation.
+Identity must satisfy: identity ⊕ x = x for the reduction operation.
+"""
+# Addition identity: 0 + x = x
+reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: AbstractFloat =
+    FloatIdentity(0.0, dtype, T)
+reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: Integer =
+    FloatIdentity(0.0, dtype, T)
+
+# Maximum identity: max(-Inf, x) = x
+reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: AbstractFloat =
+    FloatIdentity(-Inf, dtype, T)
+reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer =
+    FloatIdentity(0.0, dtype, T)  # For integers, use 0 as identity (max(0, x) = x)
+
+# Multiplication identity: 1 * x = x
+reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: AbstractFloat =
+    FloatIdentity(1.0, dtype, T)
+reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: Integer =
+    FloatIdentity(1.0, dtype, T)
+
+# Minimum identity: min(+Inf, x) = x
+reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: AbstractFloat =
+    FloatIdentity(+Inf, dtype, T)
+reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: Integer =
+    FloatIdentity(typemax(Int64), dtype, T)  # Use max int as +Inf proxy
+
+# AND identity: all bits set (x & -1 == x)
+reduce_identity(::Val{:and}, dtype, ::Type{T}) where T <: Integer =
+    FloatIdentity(0.0, dtype, T)  # Will be interpreted as -1 bits by backend
+
+# OR identity: 0 | x = x
+reduce_identity(::Val{:or}, dtype, ::Type{T}) where T <: Integer =
+    FloatIdentity(0.0, dtype, T)
+
+# XOR identity: 0 ⊕ x = x
+reduce_identity(::Val{:xor}, dtype, ::Type{T}) where T <: Integer =
+    FloatIdentity(0.0, dtype, T)
+
+#=============================================================================
+ Reduce Body Operations - dispatch on Val{fn} and elem_type
+=============================================================================#
+
 encode_reduce_body(cb, type, acc, elem, ::Val{:add}, ::Type{T}) where T <: AbstractFloat =
     encode_AddFOp!(cb, type, acc, elem)
 encode_reduce_body(cb, type, acc, elem, ::Val{:max}, ::Type{T}) where T <: AbstractFloat =
diff --git a/src/language/operations.jl b/src/language/operations.jl
index e1de7be..d028c2c 100644
--- a/src/language/operations.jl
+++ b/src/language/operations.jl
@@ -571,6 +571,99 @@ end
     Intrinsics.reduce_max(tile, Val(axis - 1))
 end
 
+"""
+    reduce_mul(tile::Tile{T, S}, axis::Integer) -> Tile{T, reduced_shape}
+
+Product reduction along the specified axis (1-indexed).
+Returns a tile with the specified dimension removed.
+
+# Example
+```julia
+# For a (128, 64) tile, reducing along axis 2:
+products = ct.reduce_mul(tile, 2)  # Returns (128,) tile
+```
+"""
+@inline function reduce_mul(tile::Tile{T, S}, axis::Integer) where {T, S}
+    Intrinsics.reduce_mul(tile, Val(axis - 1))
+end
+@inline function reduce_mul(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis}
+    Intrinsics.reduce_mul(tile, Val(axis - 1))
+end
+
+"""
+    reduce_min(tile::Tile{T, S}, axis::Integer) -> Tile{T, reduced_shape}
+
+Minimum reduction along the specified axis (1-indexed).
+
+# Example
+```julia
+mins = ct.reduce_min(tile, 2)  # Min along axis 2
+```
+"""
+@inline function reduce_min(tile::Tile{T, S}, axis::Integer) where {T, S}
+    Intrinsics.reduce_min(tile, Val(axis - 1))
+end
+@inline function reduce_min(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis}
+    Intrinsics.reduce_min(tile, Val(axis - 1))
+end
+
+"""
+    reduce_and(tile::Tile{T, S}, axis::Integer) -> Tile{T, reduced_shape}
+
+Bitwise AND reduction along the specified axis (1-indexed).
+Integer types only.
+
+# Example
+```julia
+# For an Int32 tile, reducing along axis 2:
+result = ct.reduce_and(tile, 2)  # Returns (128,) tile of Int32
+```
+"""
+@inline function reduce_and(tile::Tile{T, S}, axis::Integer) where {T <: Integer, S}
+    Intrinsics.reduce_and(tile, Val(axis - 1))
+end
+@inline function reduce_and(tile::Tile{T, S}, ::Val{axis}) where {T <: Integer, S, axis}
+    Intrinsics.reduce_and(tile, Val(axis - 1))
+end
+
+"""
+    reduce_or(tile::Tile{T, S}, axis::Integer) -> Tile{T, reduced_shape}
+
+Bitwise OR reduction along the specified axis (1-indexed).
+Integer types only.
+
+# Example
+```julia
+# For an Int32 tile, reducing along axis 2:
+result = ct.reduce_or(tile, 2)  # Returns (128,) tile of Int32
+```
+"""
+@inline function reduce_or(tile::Tile{T, S}, axis::Integer) where {T <: Integer, S}
+    Intrinsics.reduce_or(tile, Val(axis - 1))
+end
+@inline function reduce_or(tile::Tile{T, S}, ::Val{axis}) where {T <: Integer, S, axis}
+    Intrinsics.reduce_or(tile, Val(axis - 1))
+end
+
+"""
+    reduce_xor(tile::Tile{T, S}, axis::Integer) -> Tile{T, reduced_shape}
+
+Bitwise XOR reduction along the specified axis (1-indexed).
+Integer types only.
+
+# Example
+```julia
+# For an Int32 tile, reducing along axis 2:
+result = ct.reduce_xor(tile, 2)  # Returns (128,) tile of Int32
+```
+"""
+@inline function reduce_xor(tile::Tile{T, S}, axis::Integer) where {T <: Integer, S}
+    Intrinsics.reduce_xor(tile, Val(axis - 1))
+end
+@inline function reduce_xor(tile::Tile{T, S}, ::Val{axis}) where {T <: Integer, S, axis}
+    Intrinsics.reduce_xor(tile, Val(axis - 1))
+end
+
 #=============================================================================
  Matrix multiplication
 =============================================================================#

From 31282c4c34324067e7c5d9d12ab5fa08d33901b3 Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Sun, 11 Jan 2026 20:04:57 +0530
Subject: [PATCH 07/29] add IntIdentity type for integer reduce operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add IntIdentity struct to bytecode/writer.jl for proper integer identity encoding
- Add encode_tagged_int! function for encoding integer identity attributes (tag 0x01)
- Dispatch encode_identity! on identity type for proper encoding
- Update reduce_identity to return IntIdentity for integer operations
- Import ReduceIdentity, FloatIdentity, IntIdentity in intrinsics.jl

Identity values now properly typed:
- Float operations → FloatIdentity
- Integer operations → IntIdentity
---
 src/bytecode/writer.jl          | 37 ++++++++++++++++++++++++++++++++-
 src/compiler/intrinsics.jl      |  1 +
 src/compiler/intrinsics/core.jl | 14 ++++++-------
 3 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl
index eb87585..500d5ad 100644
--- a/src/bytecode/writer.jl
+++ b/src/bytecode/writer.jl
@@ -251,6 +251,17 @@ struct FloatIdentity <: ReduceIdentity
     dtype::Type  # Float16, Float32, Float64, etc.
 end
 
+"""
+    IntIdentity(value, type_id, dtype)
+
+Integer identity value for reduce operations (and, or, xor).
+"""
+struct IntIdentity <: ReduceIdentity
+    value::Int64  # Store as signed Int64, will be reinterpreted as unsigned
+    type_id::TypeId
+    dtype::Type  # Int8, Int16, Int32, Int64, UInt8, etc.
+end
+
 """
     encode_tagged_float!(cb, identity::FloatIdentity)
 
@@ -267,6 +278,21 @@ function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentity)
     encode_signed_varint!(cb.buf, bits)
 end
 
+"""
+    encode_tagged_int!(cb, identity::IntIdentity)
+
+Encode a tagged integer attribute for reduce identity.
+Format: tag(Int=0x01) + typeid + ap_int(value)
+"""
+function encode_tagged_int!(cb::CodeBuilder, identity::IntIdentity)
+    # Tag for Int attribute
+    push!(cb.buf, 0x01)
+    # Type ID
+    encode_typeid!(cb.buf, identity.type_id)
+    # Value as signed varint
+    encode_signed_varint!(cb.buf, identity.value)
+end
+
 """
     float_to_bits(value, dtype)
 
@@ -305,14 +331,23 @@ end
     encode_identity_array!(cb, identities)
 
 Encode an array of reduce identity attributes.
+Dispatches on identity type to encode correctly.
 """
 function encode_identity_array!(cb::CodeBuilder, identities::Vector{<:ReduceIdentity})
     encode_varint!(cb.buf, length(identities))
     for identity in identities
-        encode_tagged_float!(cb, identity)
+        encode_identity!(cb, identity)
     end
 end
 
+"""
+    encode_identity!(cb, identity)
+
+Encode a single identity attribute, dispatching on type.
+"""
+encode_identity!(cb::CodeBuilder, identity::FloatIdentity) = encode_tagged_float!(cb, identity)
+encode_identity!(cb::CodeBuilder, identity::IntIdentity) = encode_tagged_int!(cb, identity)
+
 """
     BytecodeWriter
 
diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl
index 16c55da..32711d2 100644
--- a/src/compiler/intrinsics.jl
+++ b/src/compiler/intrinsics.jl
@@ -8,6 +8,7 @@ using Base: compilerbarrier, donotdelete
 using ..cuTile: Tile, TileArray, Constant, TensorView, PartitionView
 using ..cuTile: Signedness, SignednessSigned, SignednessUnsigned
 using ..cuTile: ComparisonPredicate, CmpLessThan, CmpLessThanOrEqual, CmpGreaterThan, CmpGreaterThanOrEqual, CmpEqual, CmpNotEqual
+using ..cuTile: ReduceIdentity, FloatIdentity, IntIdentity
 
 end
 
diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl
index c71ff08..c9477ec 100644
--- a/src/compiler/intrinsics/core.jl
+++ b/src/compiler/intrinsics/core.jl
@@ -663,37 +663,37 @@ Identity must satisfy: identity ⊕ x = x for the reduction operation.
 reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: AbstractFloat =
     FloatIdentity(0.0, dtype, T)
 reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: Integer =
-    FloatIdentity(0.0, dtype, T)
+    IntIdentity(0, dtype, T)
 
 # Maximum identity: max(-Inf, x) = x
 reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: AbstractFloat =
     FloatIdentity(-Inf, dtype, T)
 reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer =
-    FloatIdentity(0.0, dtype, T)  # For integers, use 0 as identity (max(0, x) = x)
+    IntIdentity(0, dtype, T)  # For integers, use 0 as identity (max(0, x) = x)
 
 # Multiplication identity: 1 * x = x
 reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: AbstractFloat =
     FloatIdentity(1.0, dtype, T)
 reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: Integer =
-    FloatIdentity(1.0, dtype, T)
+    IntIdentity(1, dtype, T)
 
 # Minimum identity: min(+Inf, x) = x
 reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: AbstractFloat =
     FloatIdentity(+Inf, dtype, T)
 reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: Integer =
-    FloatIdentity(typemax(Int64), dtype, T)  # Use max int as +Inf proxy
+    IntIdentity(typemax(Int64), dtype, T)  # Use max int as +Inf proxy
 
 # AND identity: all bits set (x & -1 == x)
 reduce_identity(::Val{:and}, dtype, ::Type{T}) where T <: Integer =
-    FloatIdentity(0.0, dtype, T)  # Will be interpreted as -1 bits by backend
+    IntIdentity(-1, dtype, T)  # All bits set
 
 # OR identity: 0 | x = x
 reduce_identity(::Val{:or}, dtype, ::Type{T}) where T <: Integer =
-    FloatIdentity(0.0, dtype, T)
+    IntIdentity(0, dtype, T)
 
 # XOR identity: 0 ⊕ x = x
 reduce_identity(::Val{:xor}, dtype, ::Type{T}) where T <: Integer =
-    FloatIdentity(0.0, dtype, T)
+    IntIdentity(0, dtype, T)
 
 #=============================================================================
  Reduce Body Operations - dispatch on Val{fn} and elem_type

From e68967288eef4232b44f51b6334b250525aa56df Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Sun, 11 Jan 2026 20:08:02 +0530
Subject: [PATCH 08/29] rename IntIdentity to IntegerIdentity for clarity

---
 src/bytecode/writer.jl          | 10 +++++-----
 src/compiler/intrinsics.jl      |  2 +-
 src/compiler/intrinsics/core.jl | 14 +++++++-------
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl
index 500d5ad..94a8185 100644
--- a/src/bytecode/writer.jl
+++ b/src/bytecode/writer.jl
@@ -252,11 +252,11 @@ struct FloatIdentity <: ReduceIdentity
 end
 
 """
-    IntIdentity(value, type_id, dtype)
+    IntegerIdentity(value, type_id, dtype)
 
 Integer identity value for reduce operations (and, or, xor).
 """
-struct IntIdentity <: ReduceIdentity
+struct IntegerIdentity <: ReduceIdentity
     value::Int64  # Store as signed Int64, will be reinterpreted as unsigned
     type_id::TypeId
     dtype::Type  # Int8, Int16, Int32, Int64, UInt8, etc.
@@ -279,12 +279,12 @@ function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentity)
 end
 
 """
-    encode_tagged_int!(cb, identity::IntIdentity)
+    encode_tagged_int!(cb, identity::IntegerIdentity)
 
 Encode a tagged integer attribute for reduce identity.
 Format: tag(Int=0x01) + typeid + ap_int(value)
 """
-function encode_tagged_int!(cb::CodeBuilder, identity::IntIdentity)
+function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentity)
     # Tag for Int attribute
     push!(cb.buf, 0x01)
     # Type ID
@@ -346,7 +346,7 @@ end
 Encode a single identity attribute, dispatching on type.
 """
 encode_identity!(cb::CodeBuilder, identity::FloatIdentity) = encode_tagged_float!(cb, identity)
-encode_identity!(cb::CodeBuilder, identity::IntIdentity) = encode_tagged_int!(cb, identity)
+encode_identity!(cb::CodeBuilder, identity::IntegerIdentity) = encode_tagged_int!(cb, identity)
 
 """
     BytecodeWriter
diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl
index 32711d2..7658c79 100644
--- a/src/compiler/intrinsics.jl
+++ b/src/compiler/intrinsics.jl
@@ -8,7 +8,7 @@ using Base: compilerbarrier, donotdelete
 using ..cuTile: Tile, TileArray, Constant, TensorView, PartitionView
 using ..cuTile: Signedness, SignednessSigned, SignednessUnsigned
 using ..cuTile: ComparisonPredicate, CmpLessThan, CmpLessThanOrEqual, CmpGreaterThan, CmpGreaterThanOrEqual, CmpEqual, CmpNotEqual
-using ..cuTile: ReduceIdentity, FloatIdentity, IntIdentity
+using ..cuTile: ReduceIdentity, FloatIdentity, IntegerIdentity
 
 end
 
diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl
index c9477ec..c91b9c5 100644
--- a/src/compiler/intrinsics/core.jl
+++ b/src/compiler/intrinsics/core.jl
@@ -663,37 +663,37 @@ Identity must satisfy: identity ⊕ x = x for the reduction operation.
 reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: AbstractFloat =
     FloatIdentity(0.0, dtype, T)
 reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: Integer =
-    IntIdentity(0, dtype, T)
+    IntegerIdentity(0, dtype, T)
 
 # Maximum identity: max(-Inf, x) = x
 reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: AbstractFloat =
     FloatIdentity(-Inf, dtype, T)
 reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer =
-    IntIdentity(0, dtype, T)  # For integers, use 0 as identity (max(0, x) = x)
+    IntegerIdentity(typemin(T), dtype, T)  # max(typemin(T), x) = x, also for negative x
 
 # Multiplication identity: 1 * x = x
 reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: AbstractFloat =
     FloatIdentity(1.0, dtype, T)
 reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: Integer =
-    IntIdentity(1, dtype, T)
+    IntegerIdentity(1, dtype, T)
 
 # Minimum identity: min(+Inf, x) = x
 reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: AbstractFloat =
     FloatIdentity(+Inf, dtype, T)
 reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: Integer =
-    IntIdentity(typemax(Int64), dtype, T)  # Use max int as +Inf proxy
+    IntegerIdentity(typemax(T) % Int64, dtype, T)  # typemax of T itself; % keeps UInt64 bits
 
 # AND identity: all bits set (x & -1 == x)
 reduce_identity(::Val{:and}, dtype, ::Type{T}) where T <: Integer =
-    IntIdentity(-1, dtype, T)  # All bits set
+    IntegerIdentity(-1, dtype, T)  # All bits set
 
 # OR identity: 0 | x = x
 reduce_identity(::Val{:or}, dtype, ::Type{T}) where T <: Integer =
-    IntIdentity(0, dtype, T)
+    IntegerIdentity(0, dtype, T)
 
 # XOR identity: 0 ⊕ x = x
 reduce_identity(::Val{:xor}, dtype, ::Type{T}) where T <: Integer =
-    IntIdentity(0, dtype, T)
+    IntegerIdentity(0, dtype, T)
 
 #=============================================================================
  Reduce Body Operations - dispatch on Val{fn} and elem_type

From aa67f22d542acbdd8f7cf4efc8eea86a4ef0ccdf Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Sun, 11 Jan 2026 23:47:57 +0530
Subject: [PATCH 09/29] fix: remove AbstractFloat constraint from reduce_sum
 and reduce_max

The intrinsics were updated to support all types but the wrapper functions
in operations.jl still had T <: AbstractFloat constraint, causing method
lookup failures for integer types.
---
 src/language/operations.jl | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/language/operations.jl b/src/language/operations.jl
index d028c2c..df5379a 100644
--- a/src/language/operations.jl
+++ b/src/language/operations.jl
@@ -541,16 +541,18 @@ public reduce_sum, reduce_max, reduce_mul, reduce_min, reduce_and, reduce_or, re
 Sum reduction along the specified axis (1-indexed).
 Returns a tile with the specified dimension removed.
 
+Supports any numeric type (Float16, Float32, Float64, and integer types).
+
 # Example
 ```julia
 # For a (128, 64) tile, reducing along axis 2:
 sums = ct.reduce_sum(tile, 2)  # Returns (128,) tile
 ```
 """
-@inline function reduce_sum(tile::Tile{T, S}, axis::Integer) where {T <: AbstractFloat, S}
+@inline function reduce_sum(tile::Tile{T, S}, axis::Integer) where {T, S}
     Intrinsics.reduce_sum(tile, Val(axis - 1))
 end
-@inline function reduce_sum(tile::Tile{T, S}, ::Val{axis}) where {T <: AbstractFloat, S, axis}
+@inline function reduce_sum(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis}
     Intrinsics.reduce_sum(tile, Val(axis - 1))
 end
 
@@ -558,16 +560,17 @@ end
     reduce_max(tile::Tile{T, S}, axis::Integer) -> Tile{T, reduced_shape}
 
 Maximum reduction along the specified axis (1-indexed).
+Supports any numeric type (Float16, Float32, Float64, and integer types).
 
 # Example
 ```julia
 maxes = ct.reduce_max(tile, 2)  # Max along axis 2
 ```
 """
-@inline function reduce_max(tile::Tile{T, S}, axis::Integer) where {T <: AbstractFloat, S}
+@inline function reduce_max(tile::Tile{T, S}, axis::Integer) where {T, S}
     Intrinsics.reduce_max(tile, Val(axis - 1))
 end
-@inline function reduce_max(tile::Tile{T, S}, ::Val{axis}) where {T <: AbstractFloat, S, axis}
+@inline function reduce_max(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis}
     Intrinsics.reduce_max(tile, Val(axis - 1))
 end
 

From eb98ec15007d538f4419b32bd5a118825c9bdaaf Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Sun, 11 Jan 2026 23:49:20 +0530
Subject: [PATCH 10/29] use Number constraint for numeric reduce operations

- reduce_sum, reduce_max, reduce_mul, reduce_min now use T <: Number
- Provides type safety while supporting all numeric types
- More self-documenting than unconstrained T
---
 src/language/operations.jl | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/language/operations.jl b/src/language/operations.jl
index df5379a..4e1de57 100644
--- a/src/language/operations.jl
+++ b/src/language/operations.jl
@@ -549,10 +549,10 @@ Supports any numeric type (Float16, Float32, Float64, and integer types).
 sums = ct.reduce_sum(tile, 2)  # Returns (128,) tile
 ```
 """
-@inline function reduce_sum(tile::Tile{T, S}, axis::Integer) where {T, S}
+@inline function reduce_sum(tile::Tile{T, S}, axis::Integer) where {T <: Number, S}
     Intrinsics.reduce_sum(tile, Val(axis - 1))
 end
-@inline function reduce_sum(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis}
+@inline function reduce_sum(tile::Tile{T, S}, ::Val{axis}) where {T <: Number, S, axis}
     Intrinsics.reduce_sum(tile, Val(axis - 1))
 end
 
@@ -567,10 +567,10 @@ Supports any numeric type (Float16, Float32, Float64, and integer types).
 maxes = ct.reduce_max(tile, 2)  # Max along axis 2
 ```
 """
-@inline function reduce_max(tile::Tile{T, S}, axis::Integer) where {T, S}
+@inline function reduce_max(tile::Tile{T, S}, axis::Integer) where {T <: Number, S}
     Intrinsics.reduce_max(tile, Val(axis - 1))
 end
-@inline function reduce_max(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis}
+@inline function reduce_max(tile::Tile{T, S}, ::Val{axis}) where {T <: Number, S, axis}
     Intrinsics.reduce_max(tile, Val(axis - 1))
 end
 
@@ -586,10 +586,10 @@ Returns a tile with the specified dimension removed.
 products = ct.reduce_mul(tile, 2)  # Returns (128,) tile
 ```
 """
-@inline function reduce_mul(tile::Tile{T, S}, axis::Integer) where {T, S}
+@inline function reduce_mul(tile::Tile{T, S}, axis::Integer) where {T <: Number, S}
     Intrinsics.reduce_mul(tile, Val(axis - 1))
 end
-@inline function reduce_mul(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis}
+@inline function reduce_mul(tile::Tile{T, S}, ::Val{axis}) where {T <: Number, S, axis}
     Intrinsics.reduce_mul(tile, Val(axis - 1))
 end
 
@@ -603,10 +603,10 @@ Minimum reduction along the specified axis (1-indexed).
 mins = ct.reduce_min(tile, 2)  # Min along axis 2
 ```
 """
-@inline function reduce_min(tile::Tile{T, S}, axis::Integer) where {T, S}
+@inline function reduce_min(tile::Tile{T, S}, axis::Integer) where {T <: Number, S}
     Intrinsics.reduce_min(tile, Val(axis - 1))
 end
-@inline function reduce_min(tile::Tile{T, S}, ::Val{axis}) where {T, S, axis}
+@inline function reduce_min(tile::Tile{T, S}, ::Val{axis}) where {T <: Number, S, axis}
     Intrinsics.reduce_min(tile, Val(axis - 1))
 end
 

From e097fa794eb7de99b32aa09c1933291eb9b4608f Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Sun, 11 Jan 2026 23:57:34 +0530
Subject: [PATCH 11/29] add signed field to IntegerIdentity for proper
 signed/unsigned encoding

- IntegerIdentity now has signed::Bool field
- encode_tagged_int! encodes signed with zigzag varint, unsigned with plain varint
- Add is_signed() helper that checks T <: SignedInteger
- Update all reduce_identity calls to pass is_signed(T)
---
 src/bytecode/writer.jl          | 19 ++++++++++++-------
 src/compiler/intrinsics/core.jl | 22 +++++++++++++++-------
 2 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl
index 94a8185..bd796f5 100644
--- a/src/bytecode/writer.jl
+++ b/src/bytecode/writer.jl
@@ -252,14 +252,15 @@ struct FloatIdentity <: ReduceIdentity
 end
 
 """
-    IntegerIdentity(value, type_id, dtype)
+    IntegerIdentity(value, type_id, dtype, signed)
 
-Integer identity value for reduce operations (and, or, xor).
+Integer identity value for reduce operations (add, max, mul, min, and, or, xor).
 """
 struct IntegerIdentity <: ReduceIdentity
     value::Int64  # Store as signed Int64, will be reinterpreted as unsigned
     type_id::TypeId
-    dtype::Type  # Int8, Int16, Int32, Int64, UInt8, etc.
+    dtype::Type   # Int8, Int16, Int32, Int64, UInt8, etc.
+    signed::Bool  # true for signed, false for unsigned
 end
 
 """
@@ -279,18 +280,22 @@ function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentity)
 end
 
 """
-    encode_tagged_int!(cb, identity::IntegerIdentity)
+    encode_tagged_int!(cb, identity::IntegerIdentity; is_reduce::Bool=true)
 
 Encode a tagged integer attribute for reduce identity.
 Format: tag(Int=0x01) + typeid + ap_int(value)
 """
-function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentity)
+function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentity; is_reduce::Bool=true)
     # Tag for Int attribute
     push!(cb.buf, 0x01)
     # Type ID
     encode_typeid!(cb.buf, identity.type_id)
-    # Value as signed varint
-    encode_signed_varint!(cb.buf, identity.value)
+    # Value: signed uses zigzag varint, unsigned uses plain varint
+    if identity.signed
+        encode_signed_varint!(cb.buf, identity.value)
+    else
+        encode_varint!(cb.buf, UInt64(identity.value))
+    end
 end
 
 """
diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl
index c91b9c5..cc15c25 100644
--- a/src/compiler/intrinsics/core.jl
+++ b/src/compiler/intrinsics/core.jl
@@ -653,6 +653,14 @@ end
  Reduce Identity Values via Dispatch
 =============================================================================#
 
+"""
+    is_signed(::Type{T}) -> Bool
+
+Return true if type T is signed, false for unsigned types.
+"""
+is_signed(::Type{T}) where T <: Integer = T <: SignedInteger
+is_signed(::Type{T}) where T <: AbstractFloat = false
+
 """
     reduce_identity(reduce_fn, dtype, elem_type) -> ReduceIdentity
 
@@ -663,37 +671,37 @@ Identity must satisfy: identity ⊕ x = x for the reduction operation.
 reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: AbstractFloat =
     FloatIdentity(0.0, dtype, T)
 reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentity(0, dtype, T)
+    IntegerIdentity(0, dtype, T, is_signed(T))
 
 # Maximum identity: max(-Inf, x) = x
 reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: AbstractFloat =
     FloatIdentity(-Inf, dtype, T)
 reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentity(0, dtype, T)  # For integers, use 0 as identity (max(0, x) = x)
+    IntegerIdentity(0, dtype, T, is_signed(T))  # For integers, use 0 as identity (max(0, x) = x)
 
 # Multiplication identity: 1 * x = x
 reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: AbstractFloat =
     FloatIdentity(1.0, dtype, T)
 reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentity(1, dtype, T)
+    IntegerIdentity(1, dtype, T, is_signed(T))
 
 # Minimum identity: min(+Inf, x) = x
 reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: AbstractFloat =
     FloatIdentity(+Inf, dtype, T)
 reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentity(typemax(Int64), dtype, T)  # Use max int as +Inf proxy
+    IntegerIdentity(typemax(Int64), dtype, T, is_signed(T))  # Use max int as +Inf proxy
 
 # AND identity: all bits set (x & -1 == x)
 reduce_identity(::Val{:and}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentity(-1, dtype, T)  # All bits set
+    IntegerIdentity(-1, dtype, T, is_signed(T))  # All bits set
 
 # OR identity: 0 | x = x
 reduce_identity(::Val{:or}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentity(0, dtype, T)
+    IntegerIdentity(0, dtype, T, is_signed(T))
 
 # XOR identity: 0 ⊕ x = x
 reduce_identity(::Val{:xor}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentity(0, dtype, T)
+    IntegerIdentity(0, dtype, T, is_signed(T))
 
 #=============================================================================
  Reduce Body Operations - dispatch on Val{fn} and elem_type

From 66520878048b40e53c2e71674ea338efb94c2fa8 Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Mon, 12 Jan 2026 00:02:23 +0530
Subject: [PATCH 12/29] remove unused is_reduce kwarg from encode_tagged_int

---
 src/bytecode/writer.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl
index bd796f5..78fc2b8 100644
--- a/src/bytecode/writer.jl
+++ b/src/bytecode/writer.jl
@@ -280,12 +280,12 @@ function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentity)
 end
 
 """
-    encode_tagged_int!(cb, identity::IntegerIdentity; is_reduce::Bool=true)
+    encode_tagged_int!(cb, identity::IntegerIdentity)
 
-Encode a tagged integer attribute for reduce identity.
+Encode a tagged integer identity attribute.
 Format: tag(Int=0x01) + typeid + ap_int(value)
 """
-function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentity; is_reduce::Bool=true)
+function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentity)
     # Tag for Int attribute
     push!(cb.buf, 0x01)
     # Type ID

From 8b622217211342ff1da11ecbe3c78b3d89932581 Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Mon, 12 Jan 2026 00:20:52 +0530
Subject: [PATCH 13/29] rename ReduceIdentity to OperationIdentity

- Abstract type now called OperationIdentity to reflect use by both reduce and scan operations
- FloatIdentity and IntegerIdentity now inherit from OperationIdentity
- Updated comments and docs to reflect the broader scope
- Updated import in intrinsics.jl
---
 src/bytecode/writer.jl     | 18 +++++++++---------
 src/compiler/intrinsics.jl |  2 +-
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl
index 78fc2b8..193cc07 100644
--- a/src/bytecode/writer.jl
+++ b/src/bytecode/writer.jl
@@ -234,18 +234,18 @@ end
 =============================================================================#
 
 """
-    ReduceIdentity
+    OperationIdentity
 
-Abstract type for reduce identity attributes.
+Abstract type for binary operation identity attributes (reduce, scan, etc.).
 """
-abstract type ReduceIdentity end
+abstract type OperationIdentity end
 
 """
     FloatIdentity(value, type_id, dtype)
 
-Float identity value for reduce operations.
+Float identity value for binary operations.
 """
-struct FloatIdentity <: ReduceIdentity
+struct FloatIdentity <: OperationIdentity
     value::Float64
     type_id::TypeId
     dtype::Type  # Float16, Float32, Float64, etc.
@@ -254,9 +254,9 @@ end
 """
     IntegerIdentity(value, type_id, dtype, signed)
 
-Integer identity value for reduce operations (add, max, mul, min, and, or, xor).
+Integer identity value for binary operations.
 """
-struct IntegerIdentity <: ReduceIdentity
+struct IntegerIdentity <: OperationIdentity
     value::Int64  # Store as signed Int64, will be reinterpreted as unsigned
     type_id::TypeId
     dtype::Type   # Int8, Int16, Int32, Int64, UInt8, etc.
@@ -335,10 +335,10 @@ end
 """
     encode_identity_array!(cb, identities)
 
-Encode an array of reduce identity attributes.
+Encode an array of binary operation identity attributes.
 Dispatches on identity type to encode correctly.
 """
-function encode_identity_array!(cb::CodeBuilder, identities::Vector{<:ReduceIdentity})
+function encode_identity_array!(cb::CodeBuilder, identities::Vector{<:OperationIdentity})
     encode_varint!(cb.buf, length(identities))
     for identity in identities
         encode_identity!(cb, identity)
diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl
index 7658c79..ef55efa 100644
--- a/src/compiler/intrinsics.jl
+++ b/src/compiler/intrinsics.jl
@@ -8,7 +8,7 @@ using Base: compilerbarrier, donotdelete
 using ..cuTile: Tile, TileArray, Constant, TensorView, PartitionView
 using ..cuTile: Signedness, SignednessSigned, SignednessUnsigned
 using ..cuTile: ComparisonPredicate, CmpLessThan, CmpLessThanOrEqual, CmpGreaterThan, CmpGreaterThanOrEqual, CmpEqual, CmpNotEqual
-using ..cuTile: ReduceIdentity, FloatIdentity, IntegerIdentity
+using ..cuTile: OperationIdentity, FloatIdentity, IntegerIdentity
 
 end
 

From 6ddecaa2ebb09b18fe58a51482df8c7d476c8084 Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Mon, 12 Jan 2026 00:29:54 +0530
Subject: [PATCH 14/29] rename identity types to IdentityOp hierarchy

- IdentityOp: abstract type for binary operation identities
- FloatIdentityOp: concrete type for float identities
- IntegerIdentityOp: concrete type for integer identities (with signed field)
- Applied consistently across writer.jl, encodings.jl, intrinsics.jl, and core.jl
---
 src/bytecode/encodings.jl       |  2 +-
 src/bytecode/writer.jl          | 26 +++++++++++++-------------
 src/compiler/intrinsics.jl      |  2 +-
 src/compiler/intrinsics/core.jl | 28 ++++++++++++++--------------
 4 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/src/bytecode/encodings.jl b/src/bytecode/encodings.jl
index 9f06415..9d20820 100644
--- a/src/bytecode/encodings.jl
+++ b/src/bytecode/encodings.jl
@@ -1291,7 +1291,7 @@ function encode_ReduceOp!(body::Function, cb::CodeBuilder,
                           result_types::Vector{TypeId},
                           operands::Vector{Value},
                           dim::Int,
-                          identities::Vector{<:ReduceIdentity},
+                          identities::Vector{<:IdentityOp},
                           body_scalar_types::Vector{TypeId})
     encode_varint!(cb.buf, Opcode.ReduceOp)
 
diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl
index 193cc07..98b32be 100644
--- a/src/bytecode/writer.jl
+++ b/src/bytecode/writer.jl
@@ -234,29 +234,29 @@ end
 =============================================================================#
 
 """
-    OperationIdentity
+    IdentityOp
 
 Abstract type for binary operation identity attributes (reduce, scan, etc.).
 """
-abstract type OperationIdentity end
+abstract type IdentityOp end
 
 """
-    FloatIdentity(value, type_id, dtype)
+    FloatIdentityOp(value, type_id, dtype)
 
 Float identity value for binary operations.
 """
-struct FloatIdentity <: OperationIdentity
+struct FloatIdentityOp <: IdentityOp
     value::Float64
     type_id::TypeId
     dtype::Type  # Float16, Float32, Float64, etc.
 end
 
 """
-    IntegerIdentity(value, type_id, dtype, signed)
+    IntegerIdentityOp(value, type_id, dtype, signed)
 
 Integer identity value for binary operations.
 """
-struct IntegerIdentity <: OperationIdentity
+struct IntegerIdentityOp <: IdentityOp
     value::Int64  # Store as signed Int64, will be reinterpreted as unsigned
     type_id::TypeId
     dtype::Type   # Int8, Int16, Int32, Int64, UInt8, etc.
@@ -264,12 +264,12 @@ struct IntegerIdentity <: OperationIdentity
 end
 
 """
-    encode_tagged_float!(cb, identity::FloatIdentity)
+    encode_tagged_float!(cb, identity::FloatIdentityOp)
 
 Encode a tagged float attribute for reduce identity.
 Format: tag(Float=0x02) + typeid + ap_int(value_bits)
 """
-function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentity)
+function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentityOp)
     # Tag for Float attribute
     push!(cb.buf, 0x02)
     # Type ID
@@ -280,12 +280,12 @@ function encode_tagged_float!(cb::CodeBuilder, identity::FloatIdentity)
 end
 
 """
-    encode_tagged_int!(cb, identity::IntegerIdentity)
+    encode_tagged_int!(cb, identity::IntegerIdentityOp)
 
 Encode a tagged integer identity attribute.
 Format: tag(Int=0x01) + typeid + ap_int(value)
 """
-function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentity)
+function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentityOp)
     # Tag for Int attribute
     push!(cb.buf, 0x01)
     # Type ID
@@ -338,7 +338,7 @@ end
 Encode an array of binary operation identity attributes.
 Dispatches on identity type to encode correctly.
 """
-function encode_identity_array!(cb::CodeBuilder, identities::Vector{<:OperationIdentity})
+function encode_identity_array!(cb::CodeBuilder, identities::Vector{<:IdentityOp})
     encode_varint!(cb.buf, length(identities))
     for identity in identities
         encode_identity!(cb, identity)
@@ -350,8 +350,8 @@ end
 
 Encode a single identity attribute, dispatching on type.
 """
-encode_identity!(cb::CodeBuilder, identity::FloatIdentity) = encode_tagged_float!(cb, identity)
-encode_identity!(cb::CodeBuilder, identity::IntegerIdentity) = encode_tagged_int!(cb, identity)
+encode_identity!(cb::CodeBuilder, identity::FloatIdentityOp) = encode_tagged_float!(cb, identity)
+encode_identity!(cb::CodeBuilder, identity::IntegerIdentityOp) = encode_tagged_int!(cb, identity)
 
 """
     BytecodeWriter
diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl
index ef55efa..e522141 100644
--- a/src/compiler/intrinsics.jl
+++ b/src/compiler/intrinsics.jl
@@ -8,7 +8,7 @@ using Base: compilerbarrier, donotdelete
 using ..cuTile: Tile, TileArray, Constant, TensorView, PartitionView
 using ..cuTile: Signedness, SignednessSigned, SignednessUnsigned
 using ..cuTile: ComparisonPredicate, CmpLessThan, CmpLessThanOrEqual, CmpGreaterThan, CmpGreaterThanOrEqual, CmpEqual, CmpNotEqual
-using ..cuTile: OperationIdentity, FloatIdentity, IntegerIdentity
+using ..cuTile: IdentityOp, FloatIdentityOp, IntegerIdentityOp
 
 end
 
diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl
index cc15c25..47608b7 100644
--- a/src/compiler/intrinsics/core.jl
+++ b/src/compiler/intrinsics/core.jl
@@ -662,46 +662,46 @@ is_signed(::Type{T}) where T <: Integer = T <: SignedInteger
 is_signed(::Type{T}) where T <: AbstractFloat = false
 
 """
-    reduce_identity(reduce_fn, dtype, elem_type) -> ReduceIdentity
+    reduce_identity(reduce_fn, dtype, elem_type) -> IdentityOp
 
-Return the identity value for a reduction operation.
-Identity must satisfy: identity ⊕ x = x for the reduction operation.
+Return the identity value for a binary operation (reduce, scan, etc.).
+Identity must satisfy: identity ⊕ x = x for the operation.
 """
 # Addition identity: 0 + x = x
 reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: AbstractFloat =
-    FloatIdentity(0.0, dtype, T)
+    FloatIdentityOp(0.0, dtype, T)
 reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentity(0, dtype, T, is_signed(T))
+    IntegerIdentityOp(0, dtype, T, is_signed(T))
 
 # Maximum identity: max(-Inf, x) = x
 reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: AbstractFloat =
-    FloatIdentity(-Inf, dtype, T)
+    FloatIdentityOp(-Inf, dtype, T)
 reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentity(0, dtype, T, is_signed(T))  # For integers, use 0 as identity (max(0, x) = x)
+    IntegerIdentityOp(0, dtype, T, is_signed(T))  # For integers, use 0 as identity (max(0, x) = x)
 
 # Multiplication identity: 1 * x = x
 reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: AbstractFloat =
-    FloatIdentity(1.0, dtype, T)
+    FloatIdentityOp(1.0, dtype, T)
 reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentity(1, dtype, T, is_signed(T))
+    IntegerIdentityOp(1, dtype, T, is_signed(T))
 
 # Minimum identity: min(+Inf, x) = x
 reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: AbstractFloat =
-    FloatIdentity(+Inf, dtype, T)
+    FloatIdentityOp(+Inf, dtype, T)
 reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentity(typemax(Int64), dtype, T, is_signed(T))  # Use max int as +Inf proxy
+    IntegerIdentityOp(typemax(Int64), dtype, T, is_signed(T))  # Use max int as +Inf proxy
 
 # AND identity: all bits set (x & -1 == x)
 reduce_identity(::Val{:and}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentity(-1, dtype, T, is_signed(T))  # All bits set
+    IntegerIdentityOp(-1, dtype, T, is_signed(T))  # All bits set
 
 # OR identity: 0 | x = x
 reduce_identity(::Val{:or}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentity(0, dtype, T, is_signed(T))
+    IntegerIdentityOp(0, dtype, T, is_signed(T))
 
 # XOR identity: 0 ⊕ x = x
 reduce_identity(::Val{:xor}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentity(0, dtype, T, is_signed(T))
+    IntegerIdentityOp(0, dtype, T, is_signed(T))
 
 #=============================================================================
  Reduce Body Operations - dispatch on Val{fn} and elem_type

From 9fc8d0a75b14e645d9b8ae5109a4093398473abf Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Mon, 12 Jan 2026 01:13:05 +0530
Subject: [PATCH 15/29] fix is_signed to use proper Julia type hierarchy check

- T <: Integer && !(T <: Unsigned) correctly identifies:
  - Int32, Int64, etc. as signed (true)
  - UInt32, UInt64, etc. as unsigned (false)
---
 src/compiler/intrinsics/core.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl
index 47608b7..e7c5b24 100644
--- a/src/compiler/intrinsics/core.jl
+++ b/src/compiler/intrinsics/core.jl
@@ -658,7 +658,7 @@ end
 
 Return true if type T is signed, false for unsigned types.
 """
-is_signed(::Type{T}) where T <: Integer = T <: SignedInteger
+is_signed(::Type{T}) where T <: Integer = T <: Integer && !(T <: Unsigned)
 is_signed(::Type{T}) where T <: AbstractFloat = false
 
 """

From 00f0de9af01f67e38726b52e110be4d19e548c1a Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Mon, 12 Jan 2026 01:20:57 +0530
Subject: [PATCH 16/29] intrinsics: use -one(T) instead of -1 for signed AND
 identity

Ensures type-consistent encoding for Int8, Int16, etc.

intrinsics: use type-dependent identity values for reduce ops

- add: zero(T)
- max: typemin(T)
- mul: one(T)
- min: typemax(T)
- and: is_signed(T) ? -one(T) : typemax(T) for proper bit representation
- or, xor: zero(T)

Fixes encoding error for UInt32 (9223372036854775807 does not fit in 32 bits)

Update core.jl

fix reduce_min identity to use typemax(T) instead of typemax(Int64)

- For UInt32, typemax(UInt32) = 4294967295 fits in 32 bits
- typemax(Int64) = 9223372036854775807 does not fit and caused encoding error
---
 src/compiler/intrinsics/core.jl | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl
index e7c5b24..dc9d777 100644
--- a/src/compiler/intrinsics/core.jl
+++ b/src/compiler/intrinsics/core.jl
@@ -669,39 +669,41 @@ Identity must satisfy: identity ⊕ x = x for the operation.
 """
 # Addition identity: 0 + x = x
 reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: AbstractFloat =
-    FloatIdentityOp(0.0, dtype, T)
+    FloatIdentityOp(zero(T), dtype, T)
 reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentityOp(0, dtype, T, is_signed(T))
+    IntegerIdentityOp(zero(T), dtype, T, is_signed(T))
 
-# Maximum identity: max(-Inf, x) = x
+# Maximum identity: max(typemin(T), x) = x
 reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: AbstractFloat =
-    FloatIdentityOp(-Inf, dtype, T)
+    FloatIdentityOp(typemin(T), dtype, T)
 reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentityOp(0, dtype, T, is_signed(T))  # For integers, use 0 as identity (max(0, x) = x)
+    IntegerIdentityOp(typemin(T), dtype, T, is_signed(T))
 
 # Multiplication identity: 1 * x = x
 reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: AbstractFloat =
-    FloatIdentityOp(1.0, dtype, T)
+    FloatIdentityOp(one(T), dtype, T)
 reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentityOp(1, dtype, T, is_signed(T))
+    IntegerIdentityOp(one(T), dtype, T, is_signed(T))
 
-# Minimum identity: min(+Inf, x) = x
+# Minimum identity: min(typemax(T), x) = x
 reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: AbstractFloat =
-    FloatIdentityOp(+Inf, dtype, T)
+    FloatIdentityOp(typemax(T), dtype, T)
 reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentityOp(typemax(Int64), dtype, T, is_signed(T))  # Use max int as +Inf proxy
+    IntegerIdentityOp(typemax(T), dtype, T, is_signed(T))
 
-# AND identity: all bits set (x & -1 == x)
+# AND identity: all bits set (x & identity == x)
+# For signed: -one(T) has all bits set in two's complement
+# For unsigned: typemax(T) has all bits set
 reduce_identity(::Val{:and}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentityOp(-1, dtype, T, is_signed(T))  # All bits set
+    IntegerIdentityOp(is_signed(T) ? -one(T) : typemax(T), dtype, T, is_signed(T))
 
 # OR identity: 0 | x = x
 reduce_identity(::Val{:or}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentityOp(0, dtype, T, is_signed(T))
+    IntegerIdentityOp(zero(T), dtype, T, is_signed(T))
 
 # XOR identity: 0 ⊕ x = x
 reduce_identity(::Val{:xor}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentityOp(0, dtype, T, is_signed(T))
+    IntegerIdentityOp(zero(T), dtype, T, is_signed(T))
 
 #=============================================================================
  Reduce Body Operations - dispatch on Val{fn} and elem_type

From 55b601e6f4262c7b947840bcde966242494b4e8f Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Mon, 12 Jan 2026 02:22:31 +0530
Subject: [PATCH 17/29] test: restore codegen and types tests, fix reduce_ops
 reference

test: add comprehensive reduce operations tests

- Tests for all reduce ops: add, mul, min, max, and, or, xor
- Tests for Float32, Float64, Int32, UInt32, Int8 types
- Tests for axis 0 and axis 1 reductions
- Compares GPU results against CPU reference implementations
- Includes UInt32 and Int8 tests for identity encoding fix
---
 test/reduce_ops.jl | 749 +++++++++++++++++++++++++++++++++++++++++++++
 test/runtests.jl   |   2 +-
 2 files changed, 750 insertions(+), 1 deletion(-)
 create mode 100644 test/reduce_ops.jl

diff --git a/test/reduce_ops.jl b/test/reduce_ops.jl
new file mode 100644
index 0000000..762f37e
--- /dev/null
+++ b/test/reduce_ops.jl
@@ -0,0 +1,749 @@
+using cuTile
+import cuTile as ct
+using CUDA
+using Test
+
+@testset "reduce operations" begin
+
+#======================================================================
+CPU reference implementations
+======================================================================
+
+cpu_reduce_add(a::AbstractArray, dims::Integer) = sum(a, dims=dims)
+cpu_reduce_mul(a::AbstractArray, dims::Integer) = prod(a, dims=dims)
+cpu_reduce_max(a::AbstractArray, dims::Integer) = maximum(a, dims=dims)
+cpu_reduce_min(a::AbstractArray, dims::Integer) = minimum(a, dims=dims)
+
+cpu_reduce_and(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x & y, a, dims=dims)
+cpu_reduce_or(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x | y, a, dims=dims)
+cpu_reduce_xor(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x ⊻ y, a, dims=dims)
+
+#======================================================================
+Float32 operations
+======================================================================
+
+@testset "Float32 reduce_add" begin
+    function reduce_add_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 128))
+        sums = ct.reduce_sum(tile, 2)
+        ct.store(b, pid, sums)
+        return
+    end
+
+    m, n = 64, 128
+    a = CUDA.rand(Float32, m, n)
+    b = CUDA.zeros(Float32, m)
+
+    ct.launch(reduce_add_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] ≈ cpu_reduce_add(a_cpu[i:i, :], 2)[1] rtol=1e-3
+    end
+end
+
+@testset "Float32 reduce_mul" begin
+    function reduce_mul_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 32))
+        products = ct.reduce_mul(tile, 2)
+        ct.store(b, pid, products)
+        return
+    end
+
+    m, n = 32, 64
+    a = CUDA.rand(Float32, m, n) .+ 0.1f0
+    b = CUDA.ones(Float32, m)
+
+    ct.launch(reduce_mul_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] ≈ cpu_reduce_mul(a_cpu[i:i, :], 2)[1] rtol=1e-2
+    end
+end
+
+@testset "Float32 reduce_max" begin
+    function reduce_max_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 128))
+        maxes = ct.reduce_max(tile, 2)
+        ct.store(b, pid, maxes)
+        return
+    end
+
+    m, n = 64, 128
+    a = CUDA.rand(Float32, m, n)
+    b = CUDA.zeros(Float32, m)
+
+    ct.launch(reduce_max_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] ≈ cpu_reduce_max(a_cpu[i:i, :], 2)[1] rtol=1e-5
+    end
+end
+
+@testset "Float32 reduce_min" begin
+    function reduce_min_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 128))
+        mins = ct.reduce_min(tile, 2)
+        ct.store(b, pid, mins)
+        return
+    end
+
+    m, n = 64, 128
+    a = CUDA.rand(Float32, m, n)
+    b = CUDA.zeros(Float32, m)
+
+    ct.launch(reduce_min_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] ≈ cpu_reduce_min(a_cpu[i:i, :], 2)[1] rtol=1e-5
+    end
+end
+
+#======================================================================
+Float64 operations
+======================================================================
+
+@testset "Float64 reduce_add" begin
+    function reduce_add_f64_kernel(a::ct.TileArray{Float64,2}, b::ct.TileArray{Float64,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 64))
+        sums = ct.reduce_sum(tile, 2)
+        ct.store(b, pid, sums)
+        return
+    end
+
+    m, n = 32, 64
+    a = CUDA.rand(Float64, m, n)
+    b = CUDA.zeros(Float64, m)
+
+    ct.launch(reduce_add_f64_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] ≈ cpu_reduce_add(a_cpu[i:i, :], 2)[1] rtol=1e-5
+    end
+end
+
+@testset "Float64 reduce_max" begin
+    function reduce_max_f64_kernel(a::ct.TileArray{Float64,2}, b::ct.TileArray{Float64,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 64))
+        maxes = ct.reduce_max(tile, 2)
+        ct.store(b, pid, maxes)
+        return
+    end
+
+    m, n = 32, 64
+    a = CUDA.rand(Float64, m, n)
+    b = CUDA.zeros(Float64, m)
+
+    ct.launch(reduce_max_f64_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] ≈ cpu_reduce_max(a_cpu[i:i, :], 2)[1] rtol=1e-5
+    end
+end
+
+@testset "Float64 reduce_min" begin
+    function reduce_min_f64_kernel(a::ct.TileArray{Float64,2}, b::ct.TileArray{Float64,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 64))
+        mins = ct.reduce_min(tile, 2)
+        ct.store(b, pid, mins)
+        return
+    end
+
+    m, n = 32, 64
+    a = CUDA.rand(Float64, m, n)
+    b = CUDA.zeros(Float64, m)
+
+    ct.launch(reduce_min_f64_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] ≈ cpu_reduce_min(a_cpu[i:i, :], 2)[1] rtol=1e-5
+    end
+end
+
+@testset "Float64 reduce_mul" begin
+    function reduce_mul_f64_kernel(a::ct.TileArray{Float64,2}, b::ct.TileArray{Float64,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 32))
+        products = ct.reduce_mul(tile, 2)
+        ct.store(b, pid, products)
+        return
+    end
+
+    m, n = 16, 32
+    a = CUDA.rand(Float64, m, n) .+ 0.1
+    b = CUDA.ones(Float64, m)
+
+    ct.launch(reduce_mul_f64_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] ≈ cpu_reduce_mul(a_cpu[i:i, :], 2)[1] rtol=1e-2
+    end
+end
+
+#======================================================================
+Int32 operations
+======================================================================
+
+@testset "Int32 reduce_add" begin
+    function reduce_add_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 64))
+        sums = ct.reduce_sum(tile, 2)
+        ct.store(b, pid, sums)
+        return
+    end
+
+    m, n = 32, 64
+    a = CUDA.rand(Int32, m, n) .+ 1
+    b = CUDA.zeros(Int32, m)
+
+    ct.launch(reduce_add_i32_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] == cpu_reduce_add(a_cpu[i:i, :], 2)[1]
+    end
+end
+
+@testset "Int32 reduce_mul" begin
+    function reduce_mul_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 16))
+        products = ct.reduce_mul(tile, 2)
+        ct.store(b, pid, products)
+        return
+    end
+
+    m, n = 8, 16
+    a = CUDA.rand(Int32, m, n) .% 10 .+ 2
+    b = CUDA.ones(Int32, m)
+
+    ct.launch(reduce_mul_i32_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] == cpu_reduce_mul(a_cpu[i:i, :], 2)[1]
+    end
+end
+
+@testset "Int32 reduce_max" begin
+    function reduce_max_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 64))
+        maxes = ct.reduce_max(tile, 2)
+        ct.store(b, pid, maxes)
+        return
+    end
+
+    m, n = 32, 64
+    a = CUDA.rand(Int32, m, n)
+    b = CUDA.fill(typemin(Int32), m)
+
+    ct.launch(reduce_max_i32_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] == cpu_reduce_max(a_cpu[i:i, :], 2)[1]
+    end
+end
+
+@testset "Int32 reduce_min" begin
+    function reduce_min_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 64))
+        mins = ct.reduce_min(tile, 2)
+        ct.store(b, pid, mins)
+        return
+    end
+
+    m, n = 32, 64
+    a = CUDA.rand(Int32, m, n)
+    b = CUDA.fill(typemax(Int32), m)
+
+    ct.launch(reduce_min_i32_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] == cpu_reduce_min(a_cpu[i:i, :], 2)[1]
+    end
+end
+
+@testset "Int32 reduce_and" begin
+    function reduce_and_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 32))
+        result = ct.reduce_and(tile, 2)
+        ct.store(b, pid, result)
+        return
+    end
+
+    m, n = 16, 32
+    a = CUDA.rand(Int32, m, n)
+    b = CUDA.zeros(Int32, m)
+
+    ct.launch(reduce_and_i32_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] == cpu_reduce_and(a_cpu[i:i, :], 2)[1]
+    end
+end
+
+@testset "Int32 reduce_or" begin
+    function reduce_or_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 32))
+        result = ct.reduce_or(tile, 2)
+        ct.store(b, pid, result)
+        return
+    end
+
+    m, n = 16, 32
+    a = CUDA.rand(Int32, m, n)
+    b = CUDA.zeros(Int32, m)
+
+    ct.launch(reduce_or_i32_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] == cpu_reduce_or(a_cpu[i:i, :], 2)[1]
+    end
+end
+
+@testset "Int32 reduce_xor" begin
+    function reduce_xor_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 32))
+        result = ct.reduce_xor(tile, 2)
+        ct.store(b, pid, result)
+        return
+    end
+
+    m, n = 16, 32
+    a = CUDA.rand(Int32, m, n)
+    b = CUDA.zeros(Int32, m)
+
+    ct.launch(reduce_xor_i32_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] == cpu_reduce_xor(a_cpu[i:i, :], 2)[1]
+    end
+end
+
+#======================================================================
+UInt32 operations - tests AND identity encoding fix
+======================================================================
+
+@testset "UInt32 reduce_add" begin
+    function reduce_add_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 64))
+        sums = ct.reduce_sum(tile, 2)
+        ct.store(b, pid, sums)
+        return
+    end
+
+    m, n = 32, 64
+    a = CUDA.rand(UInt32, m, n)
+    b = CUDA.zeros(UInt32, m)
+
+    ct.launch(reduce_add_u32_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] == cpu_reduce_add(a_cpu[i:i, :], 2)[1]
+    end
+end
+
+@testset "UInt32 reduce_mul" begin
+    function reduce_mul_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 16))
+        products = ct.reduce_mul(tile, 2)
+        ct.store(b, pid, products)
+        return
+    end
+
+    m, n = 8, 16
+    a = CUDA.rand(UInt32, m, n) .% 10 .+ 2
+    b = CUDA.ones(UInt32, m)
+
+    ct.launch(reduce_mul_u32_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] == cpu_reduce_mul(a_cpu[i:i, :], 2)[1]
+    end
+end
+
+@testset "UInt32 reduce_max" begin
+    function reduce_max_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 64))
+        maxes = ct.reduce_max(tile, 2)
+        ct.store(b, pid, maxes)
+        return
+    end
+
+    m, n = 32, 64
+    a = CUDA.rand(UInt32, m, n)
+    b = CUDA.zeros(UInt32, m)
+
+    ct.launch(reduce_max_u32_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] == cpu_reduce_max(a_cpu[i:i, :], 2)[1]
+    end
+end
+
+@testset "UInt32 reduce_min" begin
+    function reduce_min_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 64))
+        mins = ct.reduce_min(tile, 2)
+        ct.store(b, pid, mins)
+        return
+    end
+
+    m, n = 32, 64
+    a = CUDA.rand(UInt32, m, n)
+    b = CUDA.fill(typemax(UInt32), m)
+
+    ct.launch(reduce_min_u32_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] == cpu_reduce_min(a_cpu[i:i, :], 2)[1]
+    end
+end
+
+@testset "UInt32 reduce_and" begin
+    function reduce_and_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 32))
+        result = ct.reduce_and(tile, 2)
+        ct.store(b, pid, result)
+        return
+    end
+
+    m, n = 16, 32
+    a = CUDA.rand(UInt32, m, n)
+    b = CUDA.zeros(UInt32, m)
+
+    ct.launch(reduce_and_u32_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] == cpu_reduce_and(a_cpu[i:i, :], 2)[1]
+    end
+end
+
+@testset "UInt32 reduce_or" begin
+    function reduce_or_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 32))
+        result = ct.reduce_or(tile, 2)
+        ct.store(b, pid, result)
+        return
+    end
+
+    m, n = 16, 32
+    a = CUDA.rand(UInt32, m, n)
+    b = CUDA.zeros(UInt32, m)
+
+    ct.launch(reduce_or_u32_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] == cpu_reduce_or(a_cpu[i:i, :], 2)[1]
+    end
+end
+
+@testset "UInt32 reduce_xor" begin
+    function reduce_xor_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 32))
+        result = ct.reduce_xor(tile, 2)
+        ct.store(b, pid, result)
+        return
+    end
+
+    m, n = 16, 32
+    a = CUDA.rand(UInt32, m, n)
+    b = CUDA.zeros(UInt32, m)
+
+    ct.launch(reduce_xor_u32_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] == cpu_reduce_xor(a_cpu[i:i, :], 2)[1]
+    end
+end
+
+#======================================================================
+Int8 operations - smaller integer type for encoding tests
+======================================================================
+
+@testset "Int8 reduce_add" begin
+    function reduce_add_i8_kernel(a::ct.TileArray{Int8,2}, b::ct.TileArray{Int8,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 32))
+        sums = ct.reduce_sum(tile, 2)
+        ct.store(b, pid, sums)
+        return
+    end
+
+    m, n = 16, 32
+    a = CUDA.rand(Int8, m, n)
+    b = CUDA.zeros(Int8, m)
+
+    ct.launch(reduce_add_i8_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test Int32(b_cpu[i]) == cpu_reduce_add(a_cpu[i:i, :], 2)[1]
+    end
+end
+
+@testset "Int8 reduce_max" begin
+    function reduce_max_i8_kernel(a::ct.TileArray{Int8,2}, b::ct.TileArray{Int8,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 32))
+        maxes = ct.reduce_max(tile, 2)
+        ct.store(b, pid, maxes)
+        return
+    end
+
+    m, n = 16, 32
+    a = CUDA.rand(Int8, m, n)
+    b = CUDA.fill(typemin(Int8), m)
+
+    ct.launch(reduce_max_i8_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] == cpu_reduce_max(a_cpu[i:i, :], 2)[1]
+    end
+end
+
+@testset "Int8 reduce_min" begin
+    function reduce_min_i8_kernel(a::ct.TileArray{Int8,2}, b::ct.TileArray{Int8,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 32))
+        mins = ct.reduce_min(tile, 2)
+        ct.store(b, pid, mins)
+        return
+    end
+
+    m, n = 16, 32
+    a = CUDA.rand(Int8, m, n)
+    b = CUDA.fill(typemax(Int8), m)
+
+    ct.launch(reduce_min_i8_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] == cpu_reduce_min(a_cpu[i:i, :], 2)[1]
+    end
+end
+
+@testset "Int8 reduce_and" begin
+    function reduce_and_i8_kernel(a::ct.TileArray{Int8,2}, b::ct.TileArray{Int8,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 16))
+        result = ct.reduce_and(tile, 2)
+        ct.store(b, pid, result)
+        return
+    end
+
+    m, n = 8, 16
+    a = CUDA.rand(Int8, m, n)
+    b = CUDA.zeros(Int8, m)
+
+    ct.launch(reduce_and_i8_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test Int32(b_cpu[i]) == cpu_reduce_and(a_cpu[i:i, :], 2)[1]
+    end
+end
+
+@testset "Int8 reduce_or" begin
+    function reduce_or_i8_kernel(a::ct.TileArray{Int8,2}, b::ct.TileArray{Int8,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 16))
+        result = ct.reduce_or(tile, 2)
+        ct.store(b, pid, result)
+        return
+    end
+
+    m, n = 8, 16
+    a = CUDA.rand(Int8, m, n)
+    b = CUDA.zeros(Int8, m)
+
+    ct.launch(reduce_or_i8_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test Int32(b_cpu[i]) == cpu_reduce_or(a_cpu[i:i, :], 2)[1]
+    end
+end
+
+@testset "Int8 reduce_xor" begin
+    function reduce_xor_i8_kernel(a::ct.TileArray{Int8,2}, b::ct.TileArray{Int8,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 16))
+        result = ct.reduce_xor(tile, 2)
+        ct.store(b, pid, result)
+        return
+    end
+
+    m, n = 8, 16
+    a = CUDA.rand(Int8, m, n)
+    b = CUDA.zeros(Int8, m)
+
+    ct.launch(reduce_xor_i8_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test Int32(b_cpu[i]) == cpu_reduce_xor(a_cpu[i:i, :], 2)[1]
+    end
+end
+
+#======================================================================
+Axis 0 reductions - verify both axes work
+======================================================================
+
+@testset "axis 0 reduce_sum Float32" begin
+    function reduce_sum_axis0_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (1, pid), (64, 1))
+        sums = ct.reduce_sum(tile, 1)
+        ct.store(b, pid, sums)
+        return
+    end
+
+    m, n = 64, 128
+    a = CUDA.rand(Float32, m, n)
+    b = CUDA.zeros(Float32, n)
+
+    ct.launch(reduce_sum_axis0_kernel, n, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for j in 1:n
+        @test b_cpu[j] ≈ cpu_reduce_add(a_cpu[:, j:j], 1)[1] rtol=1e-3
+    end
+end
+
+@testset "axis 0 reduce_min Int32" begin
+    function reduce_min_axis0_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (1, pid), (32, 1))
+        mins = ct.reduce_min(tile, 1)
+        ct.store(b, pid, mins)
+        return
+    end
+
+    m, n = 32, 64
+    a = CUDA.rand(Int32, m, n)
+    b = CUDA.fill(typemax(Int32), n)
+
+    ct.launch(reduce_min_axis0_i32_kernel, n, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for j in 1:n
+        @test b_cpu[j] == cpu_reduce_min(a_cpu[:, j:j], 1)[1]
+    end
+end
+
+@testset "axis 0 reduce_max UInt32" begin
+    function reduce_max_axis0_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (1, pid), (32, 1))
+        maxes = ct.reduce_max(tile, 1)
+        ct.store(b, pid, maxes)
+        return
+    end
+
+    m, n = 32, 64
+    a = CUDA.rand(UInt32, m, n)
+    b = CUDA.zeros(UInt32, n)
+
+    ct.launch(reduce_max_axis0_u32_kernel, n, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for j in 1:n
+        @test b_cpu[j] == cpu_reduce_max(a_cpu[:, j:j], 1)[1]
+    end
+end
+
+@testset "axis 0 reduce_and UInt32" begin
+    function reduce_and_axis0_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (1, pid), (16, 1))
+        result = ct.reduce_and(tile, 1)
+        ct.store(b, pid, result)
+        return
+    end
+
+    m, n = 16, 32
+    a = CUDA.rand(UInt32, m, n)
+    b = CUDA.fill(typemax(UInt32), n)
+
+    ct.launch(reduce_and_axis0_u32_kernel, n, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for j in 1:n
+        @test b_cpu[j] == cpu_reduce_and(a_cpu[:, j:j], 1)[1]
+    end
+end
+
+end  # @testset "reduce operations"
diff --git a/test/runtests.jl b/test/runtests.jl
index 6b9aed9..a52163a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -45,7 +45,7 @@ if filter_tests!(testsuite, args)
 
     cuda_functional = CUDA.functional()
     filter!(testsuite) do (test, _)
-        if in(test, ["execution"]) || startswith(test, "examples/")
+        if in(test, ["execution", "reduce_ops"]) || startswith(test, "examples/")
             return cuda_functional
         else
             return true

From 959daa4f86526c1169c0fa31dd4d2d17b2304a82 Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Mon, 12 Jan 2026 02:33:52 +0530
Subject: [PATCH 18/29] multiline comment mess in reduce_ops.jl

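The section banners opened `#=` block comments that were never closed
with `=#`. Terminate each banner line with `#` and prefix the section
titles with `#`. The tests were agent-generated, hence the mess.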
---
 test/reduce_ops.jl | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/test/reduce_ops.jl b/test/reduce_ops.jl
index 762f37e..a352c69 100644
--- a/test/reduce_ops.jl
+++ b/test/reduce_ops.jl
@@ -5,9 +5,9 @@ using Test
 
 @testset "reduce operations" begin
 
-#======================================================================
-CPU reference implementations
-======================================================================
+#======================================================================#
+# CPU reference implementations
+# =====================================================================#
 
 cpu_reduce_add(a::AbstractArray, dims::Integer) = sum(a, dims=dims)
 cpu_reduce_mul(a::AbstractArray, dims::Integer) = prod(a, dims=dims)
@@ -18,9 +18,9 @@ cpu_reduce_and(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x
 cpu_reduce_or(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x | y, a, dims=dims)
 cpu_reduce_xor(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x ⊻ y, a, dims=dims)
 
-#======================================================================
-Float32 operations
-======================================================================
+#======================================================================#
+# Float32 operations
+#======================================================================#
 
 @testset "Float32 reduce_add" begin
     function reduce_add_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1})
@@ -110,9 +110,9 @@ end
     end
 end
 
-#======================================================================
-Float64 operations
-======================================================================
+#======================================================================#
+# Float64 operations
+#======================================================================#
 
 @testset "Float64 reduce_add" begin
     function reduce_add_f64_kernel(a::ct.TileArray{Float64,2}, b::ct.TileArray{Float64,1})
@@ -202,9 +202,9 @@ end
     end
 end
 
-#======================================================================
-Int32 operations
-======================================================================
+#======================================================================#
+# Int32 operations
+#======================================================================#
 
 @testset "Int32 reduce_add" begin
     function reduce_add_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1})
@@ -360,9 +360,9 @@ end
     end
 end
 
-#======================================================================
-UInt32 operations - tests AND identity encoding fix
-======================================================================
+#======================================================================#
+# UInt32 operations - tests AND identity encoding fix
+#======================================================================#
 
 @testset "UInt32 reduce_add" begin
     function reduce_add_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1})
@@ -518,9 +518,9 @@ end
     end
 end
 
-#======================================================================
-Int8 operations - smaller integer type for encoding tests
-======================================================================
+#======================================================================#
+# Int8 operations - smaller integer type for encoding tests
+#======================================================================#
 
 @testset "Int8 reduce_add" begin
     function reduce_add_i8_kernel(a::ct.TileArray{Int8,2}, b::ct.TileArray{Int8,1})
@@ -654,9 +654,9 @@ end
     end
 end
 
-#======================================================================
-Axis 0 reductions - verify both axes work
-======================================================================
+#======================================================================#
+# Axis 0 reductions - verify both axes work
+#======================================================================#
 
 @testset "axis 0 reduce_sum Float32" begin
     function reduce_sum_axis0_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1})

From 9b0418f2af8f01920dd801930ef448f7022e8285 Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Mon, 12 Jan 2026 02:55:14 +0530
Subject: [PATCH 19/29] intrinsics: rename reduce_identity ->
 operation_identity

Prepares for reuse by scan operations. Function is shape-agnostic
and depends only on operation type and element type.
---
 src/compiler/intrinsics/core.jl | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl
index dc9d777..6ac2233 100644
--- a/src/compiler/intrinsics/core.jl
+++ b/src/compiler/intrinsics/core.jl
@@ -636,7 +636,7 @@ function emit_reduce!(ctx::CGCtx, args, reduce_fn::Symbol)
     scalar_tile_type = tile_type!(tt, dtype, Int[])
 
     # Create identity value via dispatch on reduction function and element type
-    identity = reduce_identity(Val(reduce_fn), dtype, elem_type)
+    identity = operation_identity(Val(reduce_fn), dtype, elem_type)
 
     # Emit ReduceOp
     results = encode_ReduceOp!(cb, [output_tile_type], [input_tv.v], axis, [identity], [scalar_tile_type]) do block_args
@@ -662,47 +662,47 @@ is_signed(::Type{T}) where T <: Integer = T <: Integer && !(T <: Unsigned)
 is_signed(::Type{T}) where T <: AbstractFloat = false
 
 """
-    reduce_identity(reduce_fn, dtype, elem_type) -> IdentityOp
+    operation_identity(fn, dtype, elem_type) -> IdentityOp
 
 Return the identity value for a binary operation (reduce, scan, etc.).
 Identity must satisfy: identity ⊕ x = x for the operation.
 """
 # Addition identity: 0 + x = x
-reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: AbstractFloat =
+operation_identity(::Val{:add}, dtype, ::Type{T}) where T <: AbstractFloat =
     FloatIdentityOp(zero(T), dtype, T)
-reduce_identity(::Val{:add}, dtype, ::Type{T}) where T <: Integer =
+operation_identity(::Val{:add}, dtype, ::Type{T}) where T <: Integer =
     IntegerIdentityOp(zero(T), dtype, T, is_signed(T))
 
 # Maximum identity: max(typemin(T), x) = x
-reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: AbstractFloat =
+operation_identity(::Val{:max}, dtype, ::Type{T}) where T <: AbstractFloat =
     FloatIdentityOp(typemin(T), dtype, T)
-reduce_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer =
+operation_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer =
     IntegerIdentityOp(typemin(T), dtype, T, is_signed(T))
 
 # Multiplication identity: 1 * x = x
-reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: AbstractFloat =
+operation_identity(::Val{:mul}, dtype, ::Type{T}) where T <: AbstractFloat =
     FloatIdentityOp(one(T), dtype, T)
-reduce_identity(::Val{:mul}, dtype, ::Type{T}) where T <: Integer =
+operation_identity(::Val{:mul}, dtype, ::Type{T}) where T <: Integer =
     IntegerIdentityOp(one(T), dtype, T, is_signed(T))
 
 # Minimum identity: min(typemax(T), x) = x
-reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: AbstractFloat =
+operation_identity(::Val{:min}, dtype, ::Type{T}) where T <: AbstractFloat =
     FloatIdentityOp(typemax(T), dtype, T)
-reduce_identity(::Val{:min}, dtype, ::Type{T}) where T <: Integer =
+operation_identity(::Val{:min}, dtype, ::Type{T}) where T <: Integer =
     IntegerIdentityOp(typemax(T), dtype, T, is_signed(T))
 
 # AND identity: all bits set (x & identity == x)
 # For signed: -one(T) has all bits set in two's complement
 # For unsigned: typemax(T) has all bits set
-reduce_identity(::Val{:and}, dtype, ::Type{T}) where T <: Integer =
+operation_identity(::Val{:and}, dtype, ::Type{T}) where T <: Integer =
     IntegerIdentityOp(is_signed(T) ? -one(T) : typemax(T), dtype, T, is_signed(T))
 
 # OR identity: 0 | x = x
-reduce_identity(::Val{:or}, dtype, ::Type{T}) where T <: Integer =
+operation_identity(::Val{:or}, dtype, ::Type{T}) where T <: Integer =
     IntegerIdentityOp(zero(T), dtype, T, is_signed(T))
 
 # XOR identity: 0 ⊕ x = x
-reduce_identity(::Val{:xor}, dtype, ::Type{T}) where T <: Integer =
+operation_identity(::Val{:xor}, dtype, ::Type{T}) where T <: Integer =
     IntegerIdentityOp(zero(T), dtype, T, is_signed(T))
 
 #=============================================================================

From 47a30bc0080f6be446ddb9530695c42f8097b7dd Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Mon, 12 Jan 2026 03:03:36 +0530
Subject: [PATCH 20/29] test: fix CPU reference functions for bitwise ops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Julia's reduce with dims= requires an explicit init for the &, |, ⊻
operators. AND uses the all-bits-set identity: typemax(T) for unsigned
arrays and -1 for signed arrays. OR and XOR use zero(T).
---
 test/reduce_ops.jl | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/test/reduce_ops.jl b/test/reduce_ops.jl
index a352c69..36dc9ee 100644
--- a/test/reduce_ops.jl
+++ b/test/reduce_ops.jl
@@ -14,9 +14,10 @@ cpu_reduce_mul(a::AbstractArray, dims::Integer) = prod(a, dims=dims)
 cpu_reduce_max(a::AbstractArray, dims::Integer) = maximum(a, dims=dims)
 cpu_reduce_min(a::AbstractArray, dims::Integer) = minimum(a, dims=dims)
 
-cpu_reduce_and(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x & y, a, dims=dims)
-cpu_reduce_or(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x | y, a, dims=dims)
-cpu_reduce_xor(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x ⊻ y, a, dims=dims)
+cpu_reduce_and(a::AbstractArray{<:Unsigned}, dims::Integer) = reduce((x, y) -> x & y, a, init=typemax(eltype(a)), dims=dims)
+cpu_reduce_and(a::AbstractArray{<:Signed}, dims::Integer) = reduce((x, y) -> x & y, a, init=Int64(-1), dims=dims)
+cpu_reduce_or(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x | y, a, init=zero(eltype(a)), dims=dims)
+cpu_reduce_xor(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x ⊻ y, a, init=zero(eltype(a)), dims=dims)
 
 #======================================================================#
 # Float32 operations

From baa5e658962d8bbc99ae28377152698920a9348c Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Mon, 12 Jan 2026 03:26:30 +0530
Subject: [PATCH 21/29] bytecode: fix zigzag encoding for signed varint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The original code tried to convert Int64 directly to UInt64, which fails
for negative values like typemin(Int32) = -2147483648.

Zigzag encoding maps n to (n << 1) ⊻ (n >> 63), which enables proper
encoding of negative integers in varint format.
---
 src/bytecode/writer.jl | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl
index 98b32be..7990f38 100644
--- a/src/bytecode/writer.jl
+++ b/src/bytecode/writer.jl
@@ -327,8 +327,15 @@ end
 Encode a signed integer as a variable-length integer.
 Uses zigzag encoding for signed values.
 """
-function encode_signed_varint!(buf::Vector{UInt8}, value::Union{UInt16, UInt32, UInt64, Int64})
-    # For float bits, encode as unsigned varint
+function encode_signed_varint!(buf::Vector{UInt8}, value::Int64)
+    # Zigzag encoding: (n << 1) ⊻ (n >> 63)
+    # This maps negative values to positive odd numbers
+    encoded = (UInt64(value) << 1) ⊻ (UInt64(value) >> 63)
+    encode_varint!(buf, encoded)
+end
+
+function encode_signed_varint!(buf::Vector{UInt8}, value::Union{UInt16, UInt32, UInt64})
+    # For unsigned types, just encode as-is
     encode_varint!(buf, UInt64(value))
 end
 

From 90fca21a888e34cb535783811ea2af4c4415892c Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Mon, 12 Jan 2026 04:26:09 +0530
Subject: [PATCH 22/29] reverting zigzag encoding

---
 src/bytecode/writer.jl | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl
index 7990f38..8e428ee 100644
--- a/src/bytecode/writer.jl
+++ b/src/bytecode/writer.jl
@@ -327,14 +327,8 @@ end
 Encode a signed integer as a variable-length integer.
 Uses zigzag encoding for signed values.
 """
-function encode_signed_varint!(buf::Vector{UInt8}, value::Int64)
-    # Zigzag encoding: (n << 1) ⊻ (n >> 63)
-    # This maps negative values to positive odd numbers
-    encoded = (UInt64(value) << 1) ⊻ (UInt64(value) >> 63)
-    encode_varint!(buf, encoded)
-end
 
-function encode_signed_varint!(buf::Vector{UInt8}, value::Union{UInt16, UInt32, UInt64})
+function encode_signed_varint!(buf::Vector{UInt8}, value::Union{UInt16, UInt32, UInt64, Int64})
     # For unsigned types, just encode as-is
     encode_varint!(buf, UInt64(value))
 end

From 2eb3171161e301528c82f5172a50cae33756011d Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Mon, 12 Jan 2026 04:56:01 +0530
Subject: [PATCH 23/29] bytecode: fix zigzag encoding for signed varint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Zigzag encoding: (n << 1) ⊻ (n >> 63) properly handles negative values
like typemin(Int32) = -2147483648.

Unsigned values use plain varint encoding since they don't need zigzag.
---
 src/bytecode/writer.jl | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl
index 8e428ee..7990f38 100644
--- a/src/bytecode/writer.jl
+++ b/src/bytecode/writer.jl
@@ -327,8 +327,14 @@ end
 Encode a signed integer as a variable-length integer.
 Uses zigzag encoding for signed values.
 """
+function encode_signed_varint!(buf::Vector{UInt8}, value::Int64)
+    # Zigzag encoding: (n << 1) ⊻ (n >> 63)
+    # This maps negative values to positive odd numbers
+    encoded = (UInt64(value) << 1) ⊻ (UInt64(value) >> 63)
+    encode_varint!(buf, encoded)
+end
 
-function encode_signed_varint!(buf::Vector{UInt8}, value::Union{UInt16, UInt32, UInt64, Int64})
+function encode_signed_varint!(buf::Vector{UInt8}, value::Union{UInt16, UInt32, UInt64})
     # For unsigned types, just encode as-is
     encode_varint!(buf, UInt64(value))
 end

From 5dea6f351d66a4ec415fdc3562a195bd9af968f2 Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Mon, 12 Jan 2026 04:58:05 +0530
Subject: [PATCH 24/29] bytecode: remove duplicate encode_signed_varint!

The correct implementation is in src/bytecode/basic.jl:
  function encode_signed_varint!(buf, x)
      x = x << 1
      if x < 0
          x = ~x
      end
      encode_varint!(buf, x)
  end

The duplicate in writer.jl was shadowing the correct one.
---
 src/bytecode/writer.jl | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl
index 7990f38..7973c95 100644
--- a/src/bytecode/writer.jl
+++ b/src/bytecode/writer.jl
@@ -324,21 +324,6 @@ end
 """
     encode_signed_varint!(buf, value)
 
-Encode a signed integer as a variable-length integer.
-Uses zigzag encoding for signed values.
-"""
-function encode_signed_varint!(buf::Vector{UInt8}, value::Int64)
-    # Zigzag encoding: (n << 1) ⊻ (n >> 63)
-    # This maps negative values to positive odd numbers
-    encoded = (UInt64(value) << 1) ⊻ (UInt64(value) >> 63)
-    encode_varint!(buf, encoded)
-end
-
-function encode_signed_varint!(buf::Vector{UInt8}, value::Union{UInt16, UInt32, UInt64})
-    # For unsigned types, just encode as-is
-    encode_varint!(buf, UInt64(value))
-end
-
 """
     encode_identity_array!(cb, identities)
 

From 6dc89b4c68564d1cc88810c09e59460b3f6f0687 Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Mon, 12 Jan 2026 05:26:29 +0530
Subject: [PATCH 25/29] intrinsics: pass signedness to encode_MinIOp! and
 encode_MaxIOp!

For unsigned integer types like UInt16, UInt32, the comparison must use
unsigned signedness, not the default SignednessSigned.

This fixes wrong reduction results for unsigned types where signed comparison
was causing values to be interpreted incorrectly (e.g., 0xFFFF interpreted as -1).
---
 src/compiler/intrinsics/core.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl
index 6ac2233..8ef8fbf 100644
--- a/src/compiler/intrinsics/core.jl
+++ b/src/compiler/intrinsics/core.jl
@@ -720,11 +720,11 @@ encode_reduce_body(cb, type, acc, elem, ::Val{:min}, ::Type{T}) where T <: Abstr
 encode_reduce_body(cb, type, acc, elem, ::Val{:add}, ::Type{T}) where T <: Integer =
     encode_AddIOp!(cb, type, acc, elem)
 encode_reduce_body(cb, type, acc, elem, ::Val{:max}, ::Type{T}) where T <: Integer =
-    encode_MaxIOp!(cb, type, acc, elem)
+    encode_MaxIOp!(cb, type, acc, elem; signedness=is_signed(T) ? SignednessSigned : SignednessUnsigned)
 encode_reduce_body(cb, type, acc, elem, ::Val{:mul}, ::Type{T}) where T <: Integer =
     encode_MulIOp!(cb, type, acc, elem)
 encode_reduce_body(cb, type, acc, elem, ::Val{:min}, ::Type{T}) where T <: Integer =
-    encode_MinIOp!(cb, type, acc, elem)
+    encode_MinIOp!(cb, type, acc, elem; signedness=is_signed(T) ? SignednessSigned : SignednessUnsigned)
 
 
 # less likely commutative/associative ops can be reduced too for whatever reason.

From db685ae4693ba22fff88af016814f97190f39a76 Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Mon, 12 Jan 2026 05:30:49 +0530
Subject: [PATCH 26/29] restore original encode_signed_varint! and add reducekernel example

---
 examples/reducekernel.jl | 20 ++++++++++++++++++++
 src/bytecode/writer.jl   |  9 +++++++++
 2 files changed, 29 insertions(+)
 create mode 100644 examples/reducekernel.jl

diff --git a/examples/reducekernel.jl b/examples/reducekernel.jl
new file mode 100644
index 0000000..7bb485b
--- /dev/null
+++ b/examples/reducekernel.jl
@@ -0,0 +1,20 @@
+using Test
+using CUDA
+using cuTile
+import cuTile as ct
+
+elType = UInt16
+function reduceKernel(a::ct.TileArray{elType,1}, b::ct.TileArray{elType,1}, tileSz::ct.Constant{Int})
+    bid = ct.bid(1)
+    tile = ct.load(a, bid, (tileSz[],))
+    result = ct.reduce_min(tile, Val(1))
+    ct.store(b, bid, result)
+    return nothing
+end
+
+sz = 32
+N = 2^15
+a = CUDA.rand(elType, N)
+b = CUDA.zeros(elType, cld(N, sz))
+CUDA.@sync ct.launch(reduceKernel, cld(length(a), sz), a, b, ct.Constant(sz))
+res = Array(b)
diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl
index 7973c95..8e428ee 100644
--- a/src/bytecode/writer.jl
+++ b/src/bytecode/writer.jl
@@ -324,6 +324,15 @@ end
 """
     encode_signed_varint!(buf, value)
 
+Encode a signed integer as a variable-length integer.
+Uses zigzag encoding for signed values.
+"""
+
+function encode_signed_varint!(buf::Vector{UInt8}, value::Union{UInt16, UInt32, UInt64, Int64})
+    # For unsigned types, just encode as-is
+    encode_varint!(buf, UInt64(value))
+end
+
 """
     encode_identity_array!(cb, identities)
 

From 6a1b21dac499986118551667897e7509a3b03178 Mon Sep 17 00:00:00 2001
From: Arhik <arhik23@gmail.com>
Date: Mon, 12 Jan 2026 05:32:05 +0530
Subject: [PATCH 27/29] revert comment inside encode_signed_varint

---
 src/bytecode/writer.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl
index 8e428ee..303e78a 100644
--- a/src/bytecode/writer.jl
+++ b/src/bytecode/writer.jl
@@ -329,7 +329,7 @@ Uses zigzag encoding for signed values.
 """
 
 function encode_signed_varint!(buf::Vector{UInt8}, value::Union{UInt16, UInt32, UInt64, Int64})
-    # For unsigned types, just encode as-is
+    # For float bits, encode as unsigned varint
     encode_varint!(buf, UInt64(value))
 end
 

From be4265f01cf5e1bbb2bb5c58e9f49023a04693fd Mon Sep 17 00:00:00 2001
From: arhik <arhik23@gmail.com>
Date: Fri, 16 Jan 2026 14:03:17 +0000
Subject: [PATCH 28/29] Fix SLEB128 zigzag encoding for 64-bit and small
 integer types

- writer.jl: Changed IntegerIdentityOp.value from Int64 to UInt128 to store full
  64-bit unsigned values. Added mask_to_width() to mask values to correct bit
  width before zigzag encoding.

- core.jl: Added to_uint128() helper to convert signed/unsigned values to
  UInt128 via bit reinterpretation for proper identity value storage.

- examples/reducekernel.jl: Added tests for the reduce operations min, max, sum,
  xor, or, and on UInt16/32/64 and Int16/32/64, and min, max, sum on Float16/32/64.

Fixes:
- UInt64 reduce_min and reduce_and now work correctly
- Int16 reduce_max and reduce_and now work correctly
- All small integer types (Int8, Int16, Int32) now encode properly with SLEB128
---
 examples/reducekernel.jl        | 159 +++++++++++++++++++++++++++++---
 src/bytecode/writer.jl          |  42 ++++++++-
 src/compiler/intrinsics/core.jl |  32 +++++--
 3 files changed, 209 insertions(+), 24 deletions(-)

diff --git a/examples/reducekernel.jl b/examples/reducekernel.jl
index 7bb485b..f93c094 100644
--- a/examples/reducekernel.jl
+++ b/examples/reducekernel.jl
@@ -3,18 +3,151 @@ using CUDA
 using cuTile
 import cuTile as ct
 
-elType = UInt16
-function reduceKernel(a::ct.TileArray{elType,1}, b::ct.TileArray{elType,1}, tileSz::ct.Constant{Int})
-    bid = ct.bid(1)
-    tile = ct.load(a, bid, (tileSz[],))
-    result = ct.reduce_min(tile, Val(1))
-    ct.store(b, bid, result)
-    return nothing
+# Kernel factory to properly capture element type and operation
+function makeReduceKernel(::Type{T}, op::Symbol) where {T}
+    reduceFunc = if op == :reduce_min
+        ct.reduce_min
+    elseif op == :reduce_max
+        ct.reduce_max
+    elseif op == :reduce_sum
+        ct.reduce_sum
+    elseif op == :reduce_xor
+        ct.reduce_xor
+    elseif op == :reduce_or
+        ct.reduce_or
+    elseif op == :reduce_and
+        ct.reduce_and
+    end
+
+    @inline function kernel(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, tileSz::ct.Constant{Int})
+        ct.store(b, ct.bid(1), reduceFunc(ct.load(a, ct.bid(1), (tileSz[],)), Val(1)))
+        return nothing
+    end
+    return kernel
+end
+
+# Test with UInt types
+@testset for elType in [UInt16, UInt32, UInt64]
+    @testset for op in [:reduce_min, :reduce_max, :reduce_sum, :reduce_xor, :reduce_or, :reduce_and]
+        sz = 32
+        N = 2^15
+
+        # Create kernel using factory
+        reduceKernel = try
+            makeReduceKernel(elType, op)
+        catch e
+            @test_broken false
+            rethrow()
+        end
+
+        # Create data and run kernel
+        a_gpu = CUDA.rand(elType, N)
+        b_gpu = CUDA.zeros(elType, cld(N, sz))
+        try
+            CUDA.@sync ct.launch(reduceKernel, cld(length(a_gpu), sz), a_gpu, b_gpu, ct.Constant(sz))
+        catch e
+            @test_broken false
+            rethrow()
+        end
+        res = Array(b_gpu)
+
+        # CPU computation
+        a_cpu = Array(a_gpu)
+        a_reshaped = reshape(a_cpu, sz, :)
+
+        if op == :reduce_min
+            cpu_result = minimum(a_reshaped, dims=1)[:]
+        elseif op == :reduce_max
+            cpu_result = maximum(a_reshaped, dims=1)[:]
+        elseif op == :reduce_sum
+            raw_sum = sum(a_reshaped, dims=1)[:]
+            cpu_result = raw_sum .& typemax(elType)
+        elseif op == :reduce_xor
+            cpu_result = mapslices(x -> reduce(⊻, x), a_reshaped, dims=1)[:]
+        elseif op == :reduce_or
+            cpu_result = mapslices(x -> reduce(|, x), a_reshaped, dims=1)[:]
+        elseif op == :reduce_and
+            cpu_result = mapslices(x -> reduce(&, x), a_reshaped, dims=1)[:]
+        end
+
+        @test cpu_result == res
+    end
+end
+
+# Test with signed Int types
+@testset for elType in [Int16, Int32, Int64]
+    @testset for op in [:reduce_min, :reduce_max, :reduce_sum, :reduce_xor, :reduce_or, :reduce_and]
+        sz = 32
+        N = 2^15
+
+        # Create kernel using factory
+        reduceKernel = try
+            makeReduceKernel(elType, op)
+        catch e
+            @test_broken false
+            rethrow()
+        end
+
+        # Create data and run kernel - use range to get negative values too
+        a_gpu = CuArray{elType}(rand(-1000:1000, N))
+        b_gpu = CUDA.zeros(elType, cld(N, sz))
+        try
+            CUDA.@sync ct.launch(reduceKernel, cld(length(a_gpu), sz), a_gpu, b_gpu, ct.Constant(sz))
+        catch e
+            @test_broken false
+            rethrow()
+        end
+        res = Array(b_gpu)
+
+        # CPU computation
+        a_cpu = Array(a_gpu)
+        a_reshaped = reshape(a_cpu, sz, :)
+
+        if op == :reduce_min
+            cpu_result = minimum(a_reshaped, dims=1)[:]
+        elseif op == :reduce_max
+            cpu_result = maximum(a_reshaped, dims=1)[:]
+        elseif op == :reduce_sum
+            cpu_result = sum(a_reshaped, dims=1)[:]
+        elseif op == :reduce_xor
+            cpu_result = mapslices(x -> reduce(⊻, x), a_reshaped, dims=1)[:]
+        elseif op == :reduce_or
+            cpu_result = mapslices(x -> reduce(|, x), a_reshaped, dims=1)[:]
+        elseif op == :reduce_and
+            cpu_result = mapslices(x -> reduce(&, x), a_reshaped, dims=1)[:]
+        end
+
+        @test cpu_result == res
+    end
 end
 
-sz = 32
-N = 2^15
-a = CUDA.rand(elType, N)
-b = CUDA.zeros(elType, cld(N, sz))
-CUDA.@sync ct.launch(reduceKernel, cld(length(a), sz), a, b, ct.Constant(sz))
-res = Array(b)
+# Test with Float types
+@testset for elType in [Float16, Float32, Float64]
+    @testset for op in [:reduce_min, :reduce_max, :reduce_sum]
+        sz = 32
+        N = 2^15
+
+        # Create kernel using factory
+        reduceKernel = makeReduceKernel(elType, op)
+
+        # Create data and run kernel
+        a_gpu = CUDA.rand(elType, N)
+        b_gpu = CUDA.zeros(elType, cld(N, sz))
+        CUDA.@sync ct.launch(reduceKernel, cld(length(a_gpu), sz), a_gpu, b_gpu, ct.Constant(sz))
+        res = Array(b_gpu)
+
+        # CPU computation
+        a_cpu = Array(a_gpu)
+        a_reshaped = reshape(a_cpu, sz, :)
+
+        if op == :reduce_min
+            cpu_result = minimum(a_reshaped, dims=1)[:]
+        elseif op == :reduce_max
+            cpu_result = maximum(a_reshaped, dims=1)[:]
+        elseif op == :reduce_sum
+            cpu_result = sum(a_reshaped, dims=1)[:]
+        end
+
+        @test isapprox(cpu_result, res)
+    end
+end
\ No newline at end of file
diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl
index 303e78a..a6f34c1 100644
--- a/src/bytecode/writer.jl
+++ b/src/bytecode/writer.jl
@@ -257,7 +257,7 @@ end
 Integer identity value for binary operations.
 """
 struct IntegerIdentityOp <: IdentityOp
-    value::Int64  # Store as signed Int64, will be reinterpreted as unsigned
+    value::UInt128  # Store as UInt128 to handle all unsigned values up to 64 bits
     type_id::TypeId
     dtype::Type   # Int8, Int16, Int32, Int64, UInt8, etc.
     signed::Bool  # true for signed, false for unsigned
@@ -291,13 +291,47 @@ function encode_tagged_int!(cb::CodeBuilder, identity::IntegerIdentityOp)
     # Type ID
     encode_typeid!(cb.buf, identity.type_id)
     # Value: signed uses zigzag varint, unsigned uses plain varint
+    # Mask value to correct bit width and apply zigzag for signed types
+    masked_value = mask_to_width(identity.value, identity.dtype, identity.signed)
     if identity.signed
-        encode_signed_varint!(cb.buf, identity.value)
+        encode_signed_varint!(cb.buf, masked_value)
     else
-        encode_varint!(cb.buf, UInt64(identity.value))
+        encode_varint!(cb.buf, masked_value)
     end
 end
 
+"""
+    mask_to_width(value, dtype, signed)
+
+Mask a UInt128 `value` to the bit width of `dtype`; methods dispatch on `dtype` and the `signed` argument is unused.
+For signed dtypes, this masks first, then applies the zigzag step `(masked << 1) ⊻ (masked >>> (bits - 1))` at that width.
+"""
+# Signed Int64: mask to 64 bits first, then zigzag encode
+mask_to_width(value::UInt128, ::Type{Int64}, signed::Bool) = 
+    let masked = UInt64(value & 0xFFFFFFFFFFFFFFFF)
+        UInt64((masked << 1) ⊻ (masked >>> 63))
+    end
+# Signed Int32: mask to 32 bits first, then zigzag encode
+mask_to_width(value::UInt128, ::Type{Int32}, signed::Bool) = 
+    let masked = UInt32(value & 0xFFFFFFFF)
+        UInt32((masked << 1) ⊻ (masked >>> 31))
+    end
+# Signed Int16: mask to 16 bits first, then zigzag encode
+mask_to_width(value::UInt128, ::Type{Int16}, signed::Bool) = 
+    let masked = UInt16(value & 0xFFFF)
+        UInt16((masked << 1) ⊻ (masked >>> 15))
+    end
+# Signed Int8: mask to 8 bits first, then zigzag encode
+mask_to_width(value::UInt128, ::Type{Int8}, signed::Bool) = 
+    let masked = UInt8(value & 0xFF)
+        UInt8((masked << 1) ⊻ (masked >>> 7))
+    end
+# Unsigned types: just mask to bit width, no zigzag
+mask_to_width(value::UInt128, ::Type{UInt64}, signed::Bool) = UInt64(value & 0xFFFFFFFFFFFFFFFF)
+mask_to_width(value::UInt128, ::Type{UInt32}, signed::Bool) = UInt32(value & 0xFFFFFFFF)
+mask_to_width(value::UInt128, ::Type{UInt16}, signed::Bool) = UInt16(value & 0xFFFF)
+mask_to_width(value::UInt128, ::Type{UInt8}, signed::Bool) = UInt8(value & 0xFF)
+
 """
     float_to_bits(value, dtype)
 
@@ -585,7 +619,7 @@ function finalize_function!(func_buf::Vector{UInt8}, cb::CodeBuilder,
 end
 
 #=============================================================================
- Optimization Hints 
+ Optimization Hints
 =============================================================================#
 
 """
diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl
index 8ef8fbf..53d35af 100644
--- a/src/compiler/intrinsics/core.jl
+++ b/src/compiler/intrinsics/core.jl
@@ -667,43 +667,61 @@ is_signed(::Type{T}) where T <: AbstractFloat = false
 Return the identity value for a binary operation (reduce, scan, etc.).
 Identity must satisfy: identity ⊕ x = x for the operation.
 """
+
+"""
+    to_uint128(value)
+
+Convert an integer `value` to UInt128 for storage in IntegerIdentityOp.
+For signed types, this returns the two's complement bit pattern at the value's own width, zero-extended.
+"""
+# Unsigned types: directly convert
+to_uint128(value::UInt64) = UInt128(value)
+to_uint128(value::UInt32) = UInt128(value)
+to_uint128(value::UInt16) = UInt128(value)
+to_uint128(value::UInt8) = UInt128(value)
+# Signed types: reinterpret as unsigned first, then convert
+to_uint128(value::Int64) = UInt128(reinterpret(UInt64, value))
+to_uint128(value::Int32) = UInt128(reinterpret(UInt32, value))
+to_uint128(value::Int16) = UInt128(reinterpret(UInt16, value))
+to_uint128(value::Int8) = UInt128(reinterpret(UInt8, value))
+
 # Addition identity: 0 + x = x
 operation_identity(::Val{:add}, dtype, ::Type{T}) where T <: AbstractFloat =
     FloatIdentityOp(zero(T), dtype, T)
 operation_identity(::Val{:add}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentityOp(zero(T), dtype, T, is_signed(T))
+    IntegerIdentityOp(to_uint128(zero(T)), dtype, T, is_signed(T))
 
 # Maximum identity: max(typemin(T), x) = x
 operation_identity(::Val{:max}, dtype, ::Type{T}) where T <: AbstractFloat =
     FloatIdentityOp(typemin(T), dtype, T)
 operation_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentityOp(typemin(T), dtype, T, is_signed(T))
+    IntegerIdentityOp(to_uint128(typemin(T)), dtype, T, is_signed(T))
 
 # Multiplication identity: 1 * x = x
 operation_identity(::Val{:mul}, dtype, ::Type{T}) where T <: AbstractFloat =
     FloatIdentityOp(one(T), dtype, T)
 operation_identity(::Val{:mul}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentityOp(one(T), dtype, T, is_signed(T))
+    IntegerIdentityOp(to_uint128(one(T)), dtype, T, is_signed(T))
 
 # Minimum identity: min(typemax(T), x) = x
 operation_identity(::Val{:min}, dtype, ::Type{T}) where T <: AbstractFloat =
     FloatIdentityOp(typemax(T), dtype, T)
 operation_identity(::Val{:min}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentityOp(typemax(T), dtype, T, is_signed(T))
+    IntegerIdentityOp(to_uint128(typemax(T)), dtype, T, is_signed(T))
 
 # AND identity: all bits set (x & identity == x)
 # For signed: -one(T) has all bits set in two's complement
 # For unsigned: typemax(T) has all bits set
 operation_identity(::Val{:and}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentityOp(is_signed(T) ? -one(T) : typemax(T), dtype, T, is_signed(T))
+    IntegerIdentityOp(to_uint128(is_signed(T) ? -one(T) : typemax(T)), dtype, T, is_signed(T))
 
 # OR identity: 0 | x = x
 operation_identity(::Val{:or}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentityOp(zero(T), dtype, T, is_signed(T))
+    IntegerIdentityOp(to_uint128(zero(T)), dtype, T, is_signed(T))
 
 # XOR identity: 0 ⊕ x = x
 operation_identity(::Val{:xor}, dtype, ::Type{T}) where T <: Integer =
-    IntegerIdentityOp(zero(T), dtype, T, is_signed(T))
+    IntegerIdentityOp(to_uint128(zero(T)), dtype, T, is_signed(T))
 
 #=============================================================================
  Reduce Body Operations - dispatch on Val{fn} and elem_type

From eb34ca3615bfb425dd0fba9473014bf9f810f58b Mon Sep 17 00:00:00 2001
From: arhik <arhik23@gmail.com>
Date: Fri, 16 Jan 2026 15:20:07 +0000
Subject: [PATCH 29/29] Simplify reduce_ops.jl tests with broadcasting

- Use kernel factory pattern matching reducekernel.jl style
- Replace per-row loops with simpler broadcasting approach
- Consolidate tests using @testset for loops over element types and operations
- Streamline CPU reference computation
---
 test/reduce_ops.jl | 890 ++++++++-------------------------------------
 1 file changed, 147 insertions(+), 743 deletions(-)

diff --git a/test/reduce_ops.jl b/test/reduce_ops.jl
index 36dc9ee..4250904 100644
--- a/test/reduce_ops.jl
+++ b/test/reduce_ops.jl
@@ -1,750 +1,154 @@
+
 using cuTile
 import cuTile as ct
 using CUDA
 using Test
 
-@testset "reduce operations" begin
-
-#======================================================================#
-# CPU reference implementations
-# =====================================================================#
-
-cpu_reduce_add(a::AbstractArray, dims::Integer) = sum(a, dims=dims)
-cpu_reduce_mul(a::AbstractArray, dims::Integer) = prod(a, dims=dims)
-cpu_reduce_max(a::AbstractArray, dims::Integer) = maximum(a, dims=dims)
-cpu_reduce_min(a::AbstractArray, dims::Integer) = minimum(a, dims=dims)
-
-cpu_reduce_and(a::AbstractArray{<:Unsigned}, dims::Integer) = reduce((x, y) -> x & y, a, init=typemax(eltype(a)), dims=dims)
-cpu_reduce_and(a::AbstractArray{<:Signed}, dims::Integer) = reduce((x, y) -> x & y, a, init=Int64(-1), dims=dims)
-cpu_reduce_or(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x | y, a, init=zero(eltype(a)), dims=dims)
-cpu_reduce_xor(a::AbstractArray{<:Integer}, dims::Integer) = reduce((x, y) -> x ⊻ y, a, init=zero(eltype(a)), dims=dims)
-
-#======================================================================#
-# Float32 operations
-#======================================================================#
-
-@testset "Float32 reduce_add" begin
-    function reduce_add_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 128))
-        sums = ct.reduce_sum(tile, 2)
-        ct.store(b, pid, sums)
-        return
-    end
-
-    m, n = 64, 128
-    a = CUDA.rand(Float32, m, n)
-    b = CUDA.zeros(Float32, m)
-
-    ct.launch(reduce_add_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] ≈ cpu_reduce_add(a_cpu[i:i, :], 2)[1] rtol=1e-3
-    end
-end
-
-@testset "Float32 reduce_mul" begin
-    function reduce_mul_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 32))
-        products = ct.reduce_mul(tile, 2)
-        ct.store(b, pid, products)
-        return
-    end
-
-    m, n = 32, 64
-    a = CUDA.rand(Float32, m, n) .+ 0.1f0
-    b = CUDA.ones(Float32, m)
-
-    ct.launch(reduce_mul_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] ≈ cpu_reduce_mul(a_cpu[i:i, :], 2)[1] rtol=1e-2
+# Kernel factory to properly capture element type and operation
+function makeReduceKernel(::Type{T}, op::Symbol) where {T}
+    reduceFunc = if op == :reduce_min
+        ct.reduce_min
+    elseif op == :reduce_max
+        ct.reduce_max
+    elseif op == :reduce_sum
+        ct.reduce_sum
+    elseif op == :reduce_xor
+        ct.reduce_xor
+    elseif op == :reduce_or
+        ct.reduce_or
+    elseif op == :reduce_and
+        ct.reduce_and
+    end
+
+    @inline function kernel(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, tileSz::ct.Constant{Int})
+        ct.store(b, ct.bid(1), reduceFunc(ct.load(a, ct.bid(1), (tileSz[],)), Val(1)))
+        return nothing
+    end
+    return kernel
+end
+
+# Test with UInt types
+@testset for elType in [UInt16, UInt32, UInt64]
+    @testset for op in [:reduce_min, :reduce_max, :reduce_sum, :reduce_xor, :reduce_or, :reduce_and]
+        sz = 32
+        N = 2^15
+
+        # Create kernel using factory
+        reduceKernel = try
+            makeReduceKernel(elType, op)
+        catch e
+            @test_broken false
+            rethrow()
+        end
+
+        # Create data and run kernel
+        a_gpu = CUDA.rand(elType, N)
+        b_gpu = CUDA.zeros(elType, cld(N, sz))
+        try
+            CUDA.@sync ct.launch(reduceKernel, cld(length(a_gpu), sz), a_gpu, b_gpu, ct.Constant(sz))
+        catch e
+            @test_broken false
+            rethrow()
+        end
+        res = Array(b_gpu)
+
+        # CPU computation
+        a_cpu = Array(a_gpu)
+        a_reshaped = reshape(a_cpu, sz, :)
+
+        if op == :reduce_min
+            cpu_result = minimum(a_reshaped, dims=1)[:]
+        elseif op == :reduce_max
+            cpu_result = maximum(a_reshaped, dims=1)[:]
+        elseif op == :reduce_sum
+            raw_sum = sum(a_reshaped, dims=1)[:]
+            cpu_result = raw_sum .& typemax(elType)
+        elseif op == :reduce_xor
+            cpu_result = mapslices(x -> reduce(⊻, x), a_reshaped, dims=1)[:]
+        elseif op == :reduce_or
+            cpu_result = mapslices(x -> reduce(|, x), a_reshaped, dims=1)[:]
+        elseif op == :reduce_and
+            cpu_result = mapslices(x -> reduce(&, x), a_reshaped, dims=1)[:]
+        end
+
+        @test cpu_result == res
+    end
+end
+
+# Test with signed Int types
+@testset for elType in [Int16, Int32, Int64]
+    @testset for op in [:reduce_min, :reduce_max, :reduce_sum, :reduce_xor, :reduce_or, :reduce_and]
+        sz = 32
+        N = 2^15
+
+        # Create kernel using factory
+        reduceKernel = try
+            makeReduceKernel(elType, op)
+        catch e
+            @test_broken false
+            rethrow()
+        end
+
+        # Create data and run kernel - use range to get negative values too
+        a_gpu = CuArray{elType}(rand(-1000:1000, N))
+        b_gpu = CUDA.zeros(elType, cld(N, sz))
+        try
+            CUDA.@sync ct.launch(reduceKernel, cld(length(a_gpu), sz), a_gpu, b_gpu, ct.Constant(sz))
+        catch e
+            @test_broken false
+            rethrow()
+        end
+        res = Array(b_gpu)
+
+        # CPU computation
+        a_cpu = Array(a_gpu)
+        a_reshaped = reshape(a_cpu, sz, :)
+
+        if op == :reduce_min
+            cpu_result = minimum(a_reshaped, dims=1)[:]
+        elseif op == :reduce_max
+            cpu_result = maximum(a_reshaped, dims=1)[:]
+        elseif op == :reduce_sum
+            cpu_result = sum(a_reshaped, dims=1)[:]
+        elseif op == :reduce_xor
+            cpu_result = mapslices(x -> reduce(⊻, x), a_reshaped, dims=1)[:]
+        elseif op == :reduce_or
+            cpu_result = mapslices(x -> reduce(|, x), a_reshaped, dims=1)[:]
+        elseif op == :reduce_and
+            cpu_result = mapslices(x -> reduce(&, x), a_reshaped, dims=1)[:]
+        end
+
+        @test cpu_result == res
+    end
+end
+
+# Test with Float types
+@testset for elType in [Float16, Float32, Float64]
+    @testset for op in [:reduce_min, :reduce_max, :reduce_sum]
+        sz = 32
+        N = 2^15
+
+        # Create kernel using factory
+        reduceKernel = makeReduceKernel(elType, op)
+
+        # Create data and run kernel
+        a_gpu = CUDA.rand(elType, N)
+        b_gpu = CUDA.zeros(elType, cld(N, sz))
+        CUDA.@sync ct.launch(reduceKernel, cld(length(a_gpu), sz), a_gpu, b_gpu, ct.Constant(sz))
+        res = Array(b_gpu)
+
+        # CPU computation
+        a_cpu = Array(a_gpu)
+        a_reshaped = reshape(a_cpu, sz, :)
+
+        if op == :reduce_min
+            cpu_result = minimum(a_reshaped, dims=1)[:]
+        elseif op == :reduce_max
+            cpu_result = maximum(a_reshaped, dims=1)[:]
+        elseif op == :reduce_sum
+            cpu_result = sum(a_reshaped, dims=1)[:]
+        end
+
+        @test isapprox(cpu_result, res)
     end
 end
-
-@testset "Float32 reduce_max" begin
-    function reduce_max_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 128))
-        maxes = ct.reduce_max(tile, 2)
-        ct.store(b, pid, maxes)
-        return
-    end
-
-    m, n = 64, 128
-    a = CUDA.rand(Float32, m, n)
-    b = CUDA.zeros(Float32, m)
-
-    ct.launch(reduce_max_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] ≈ cpu_reduce_max(a_cpu[i:i, :], 2)[1] rtol=1e-5
-    end
-end
-
-@testset "Float32 reduce_min" begin
-    function reduce_min_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 128))
-        mins = ct.reduce_min(tile, 2)
-        ct.store(b, pid, mins)
-        return
-    end
-
-    m, n = 64, 128
-    a = CUDA.rand(Float32, m, n)
-    b = CUDA.zeros(Float32, m)
-
-    ct.launch(reduce_min_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] ≈ cpu_reduce_min(a_cpu[i:i, :], 2)[1] rtol=1e-5
-    end
-end
-
-#======================================================================#
-# Float64 operations
-#======================================================================#
-
-@testset "Float64 reduce_add" begin
-    function reduce_add_f64_kernel(a::ct.TileArray{Float64,2}, b::ct.TileArray{Float64,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 64))
-        sums = ct.reduce_sum(tile, 2)
-        ct.store(b, pid, sums)
-        return
-    end
-
-    m, n = 32, 64
-    a = CUDA.rand(Float64, m, n)
-    b = CUDA.zeros(Float64, m)
-
-    ct.launch(reduce_add_f64_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] ≈ cpu_reduce_add(a_cpu[i:i, :], 2)[1] rtol=1e-5
-    end
-end
-
-@testset "Float64 reduce_max" begin
-    function reduce_max_f64_kernel(a::ct.TileArray{Float64,2}, b::ct.TileArray{Float64,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 64))
-        maxes = ct.reduce_max(tile, 2)
-        ct.store(b, pid, maxes)
-        return
-    end
-
-    m, n = 32, 64
-    a = CUDA.rand(Float64, m, n)
-    b = CUDA.zeros(Float64, m)
-
-    ct.launch(reduce_max_f64_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] ≈ cpu_reduce_max(a_cpu[i:i, :], 2)[1] rtol=1e-5
-    end
-end
-
-@testset "Float64 reduce_min" begin
-    function reduce_min_f64_kernel(a::ct.TileArray{Float64,2}, b::ct.TileArray{Float64,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 64))
-        mins = ct.reduce_min(tile, 2)
-        ct.store(b, pid, mins)
-        return
-    end
-
-    m, n = 32, 64
-    a = CUDA.rand(Float64, m, n)
-    b = CUDA.zeros(Float64, m)
-
-    ct.launch(reduce_min_f64_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] ≈ cpu_reduce_min(a_cpu[i:i, :], 2)[1] rtol=1e-5
-    end
-end
-
-@testset "Float64 reduce_mul" begin
-    function reduce_mul_f64_kernel(a::ct.TileArray{Float64,2}, b::ct.TileArray{Float64,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 32))
-        products = ct.reduce_mul(tile, 2)
-        ct.store(b, pid, products)
-        return
-    end
-
-    m, n = 16, 32
-    a = CUDA.rand(Float64, m, n) .+ 0.1
-    b = CUDA.ones(Float64, m)
-
-    ct.launch(reduce_mul_f64_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] ≈ cpu_reduce_mul(a_cpu[i:i, :], 2)[1] rtol=1e-2
-    end
-end
-
-#======================================================================#
-# Int32 operations
-#======================================================================#
-
-@testset "Int32 reduce_add" begin
-    function reduce_add_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 64))
-        sums = ct.reduce_sum(tile, 2)
-        ct.store(b, pid, sums)
-        return
-    end
-
-    m, n = 32, 64
-    a = CUDA.rand(Int32, m, n) .+ 1
-    b = CUDA.zeros(Int32, m)
-
-    ct.launch(reduce_add_i32_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] == cpu_reduce_add(a_cpu[i:i, :], 2)[1]
-    end
-end
-
-@testset "Int32 reduce_mul" begin
-    function reduce_mul_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 16))
-        products = ct.reduce_mul(tile, 2)
-        ct.store(b, pid, products)
-        return
-    end
-
-    m, n = 8, 16
-    a = CUDA.rand(Int32, m, n) .% 10 .+ 2
-    b = CUDA.ones(Int32, m)
-
-    ct.launch(reduce_mul_i32_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] == cpu_reduce_mul(a_cpu[i:i, :], 2)[1]
-    end
-end
-
-@testset "Int32 reduce_max" begin
-    function reduce_max_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 64))
-        maxes = ct.reduce_max(tile, 2)
-        ct.store(b, pid, maxes)
-        return
-    end
-
-    m, n = 32, 64
-    a = CUDA.rand(Int32, m, n)
-    b = CUDA.fill(typemin(Int32), m)
-
-    ct.launch(reduce_max_i32_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] == cpu_reduce_max(a_cpu[i:i, :], 2)[1]
-    end
-end
-
-@testset "Int32 reduce_min" begin
-    function reduce_min_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 64))
-        mins = ct.reduce_min(tile, 2)
-        ct.store(b, pid, mins)
-        return
-    end
-
-    m, n = 32, 64
-    a = CUDA.rand(Int32, m, n)
-    b = CUDA.fill(typemax(Int32), m)
-
-    ct.launch(reduce_min_i32_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] == cpu_reduce_min(a_cpu[i:i, :], 2)[1]
-    end
-end
-
-@testset "Int32 reduce_and" begin
-    function reduce_and_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 32))
-        result = ct.reduce_and(tile, 2)
-        ct.store(b, pid, result)
-        return
-    end
-
-    m, n = 16, 32
-    a = CUDA.rand(Int32, m, n)
-    b = CUDA.zeros(Int32, m)
-
-    ct.launch(reduce_and_i32_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] == cpu_reduce_and(a_cpu[i:i, :], 2)[1]
-    end
-end
-
-@testset "Int32 reduce_or" begin
-    function reduce_or_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 32))
-        result = ct.reduce_or(tile, 2)
-        ct.store(b, pid, result)
-        return
-    end
-
-    m, n = 16, 32
-    a = CUDA.rand(Int32, m, n)
-    b = CUDA.zeros(Int32, m)
-
-    ct.launch(reduce_or_i32_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] == cpu_reduce_or(a_cpu[i:i, :], 2)[1]
-    end
-end
-
-@testset "Int32 reduce_xor" begin
-    function reduce_xor_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 32))
-        result = ct.reduce_xor(tile, 2)
-        ct.store(b, pid, result)
-        return
-    end
-
-    m, n = 16, 32
-    a = CUDA.rand(Int32, m, n)
-    b = CUDA.zeros(Int32, m)
-
-    ct.launch(reduce_xor_i32_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] == cpu_reduce_xor(a_cpu[i:i, :], 2)[1]
-    end
-end
-
-#======================================================================#
-# UInt32 operations - tests AND identity encoding fix
-#======================================================================#
-
-@testset "UInt32 reduce_add" begin
-    function reduce_add_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 64))
-        sums = ct.reduce_sum(tile, 2)
-        ct.store(b, pid, sums)
-        return
-    end
-
-    m, n = 32, 64
-    a = CUDA.rand(UInt32, m, n)
-    b = CUDA.zeros(UInt32, m)
-
-    ct.launch(reduce_add_u32_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] == cpu_reduce_add(a_cpu[i:i, :], 2)[1]
-    end
-end
-
-@testset "UInt32 reduce_mul" begin
-    function reduce_mul_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 16))
-        products = ct.reduce_mul(tile, 2)
-        ct.store(b, pid, products)
-        return
-    end
-
-    m, n = 8, 16
-    a = CUDA.rand(UInt32, m, n) .% 10 .+ 2
-    b = CUDA.ones(UInt32, m)
-
-    ct.launch(reduce_mul_u32_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] == cpu_reduce_mul(a_cpu[i:i, :], 2)[1]
-    end
-end
-
-@testset "UInt32 reduce_max" begin
-    function reduce_max_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 64))
-        maxes = ct.reduce_max(tile, 2)
-        ct.store(b, pid, maxes)
-        return
-    end
-
-    m, n = 32, 64
-    a = CUDA.rand(UInt32, m, n)
-    b = CUDA.zeros(UInt32, m)
-
-    ct.launch(reduce_max_u32_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] == cpu_reduce_max(a_cpu[i:i, :], 2)[1]
-    end
-end
-
-@testset "UInt32 reduce_min" begin
-    function reduce_min_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 64))
-        mins = ct.reduce_min(tile, 2)
-        ct.store(b, pid, mins)
-        return
-    end
-
-    m, n = 32, 64
-    a = CUDA.rand(UInt32, m, n)
-    b = CUDA.fill(typemax(UInt32), m)
-
-    ct.launch(reduce_min_u32_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] == cpu_reduce_min(a_cpu[i:i, :], 2)[1]
-    end
-end
-
-@testset "UInt32 reduce_and" begin
-    function reduce_and_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 32))
-        result = ct.reduce_and(tile, 2)
-        ct.store(b, pid, result)
-        return
-    end
-
-    m, n = 16, 32
-    a = CUDA.rand(UInt32, m, n)
-    b = CUDA.zeros(UInt32, m)
-
-    ct.launch(reduce_and_u32_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] == cpu_reduce_and(a_cpu[i:i, :], 2)[1]
-    end
-end
-
-@testset "UInt32 reduce_or" begin
-    function reduce_or_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 32))
-        result = ct.reduce_or(tile, 2)
-        ct.store(b, pid, result)
-        return
-    end
-
-    m, n = 16, 32
-    a = CUDA.rand(UInt32, m, n)
-    b = CUDA.zeros(UInt32, m)
-
-    ct.launch(reduce_or_u32_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] == cpu_reduce_or(a_cpu[i:i, :], 2)[1]
-    end
-end
-
-@testset "UInt32 reduce_xor" begin
-    function reduce_xor_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 32))
-        result = ct.reduce_xor(tile, 2)
-        ct.store(b, pid, result)
-        return
-    end
-
-    m, n = 16, 32
-    a = CUDA.rand(UInt32, m, n)
-    b = CUDA.zeros(UInt32, m)
-
-    ct.launch(reduce_xor_u32_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] == cpu_reduce_xor(a_cpu[i:i, :], 2)[1]
-    end
-end
-
-#======================================================================#
-# Int8 operations - smaller integer type for encoding tests
-#======================================================================#
-
-@testset "Int8 reduce_add" begin
-    function reduce_add_i8_kernel(a::ct.TileArray{Int8,2}, b::ct.TileArray{Int8,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 32))
-        sums = ct.reduce_sum(tile, 2)
-        ct.store(b, pid, sums)
-        return
-    end
-
-    m, n = 16, 32
-    a = CUDA.rand(Int8, m, n)
-    b = CUDA.zeros(Int8, m)
-
-    ct.launch(reduce_add_i8_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test Int32(b_cpu[i]) == cpu_reduce_add(a_cpu[i:i, :], 2)[1]
-    end
-end
-
-@testset "Int8 reduce_max" begin
-    function reduce_max_i8_kernel(a::ct.TileArray{Int8,2}, b::ct.TileArray{Int8,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 32))
-        maxes = ct.reduce_max(tile, 2)
-        ct.store(b, pid, maxes)
-        return
-    end
-
-    m, n = 16, 32
-    a = CUDA.rand(Int8, m, n)
-    b = CUDA.fill(typemin(Int8), m)
-
-    ct.launch(reduce_max_i8_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] == cpu_reduce_max(a_cpu[i:i, :], 2)[1]
-    end
-end
-
-@testset "Int8 reduce_min" begin
-    function reduce_min_i8_kernel(a::ct.TileArray{Int8,2}, b::ct.TileArray{Int8,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 32))
-        mins = ct.reduce_min(tile, 2)
-        ct.store(b, pid, mins)
-        return
-    end
-
-    m, n = 16, 32
-    a = CUDA.rand(Int8, m, n)
-    b = CUDA.fill(typemax(Int8), m)
-
-    ct.launch(reduce_min_i8_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test b_cpu[i] == cpu_reduce_min(a_cpu[i:i, :], 2)[1]
-    end
-end
-
-@testset "Int8 reduce_and" begin
-    function reduce_and_i8_kernel(a::ct.TileArray{Int8,2}, b::ct.TileArray{Int8,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 16))
-        result = ct.reduce_and(tile, 2)
-        ct.store(b, pid, result)
-        return
-    end
-
-    m, n = 8, 16
-    a = CUDA.rand(Int8, m, n)
-    b = CUDA.zeros(Int8, m)
-
-    ct.launch(reduce_and_i8_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test Int32(b_cpu[i]) == cpu_reduce_and(a_cpu[i:i, :], 2)[1]
-    end
-end
-
-@testset "Int8 reduce_or" begin
-    function reduce_or_i8_kernel(a::ct.TileArray{Int8,2}, b::ct.TileArray{Int8,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 16))
-        result = ct.reduce_or(tile, 2)
-        ct.store(b, pid, result)
-        return
-    end
-
-    m, n = 8, 16
-    a = CUDA.rand(Int8, m, n)
-    b = CUDA.zeros(Int8, m)
-
-    ct.launch(reduce_or_i8_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test Int32(b_cpu[i]) == cpu_reduce_or(a_cpu[i:i, :], 2)[1]
-    end
-end
-
-@testset "Int8 reduce_xor" begin
-    function reduce_xor_i8_kernel(a::ct.TileArray{Int8,2}, b::ct.TileArray{Int8,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (pid, 1), (1, 16))
-        result = ct.reduce_xor(tile, 2)
-        ct.store(b, pid, result)
-        return
-    end
-
-    m, n = 8, 16
-    a = CUDA.rand(Int8, m, n)
-    b = CUDA.zeros(Int8, m)
-
-    ct.launch(reduce_xor_i8_kernel, m, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for i in 1:m
-        @test Int32(b_cpu[i]) == cpu_reduce_xor(a_cpu[i:i, :], 2)[1]
-    end
-end
-
-#======================================================================#
-# Axis 0 reductions - verify both axes work
-#======================================================================#
-
-@testset "axis 0 reduce_sum Float32" begin
-    function reduce_sum_axis0_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (1, pid), (64, 1))
-        sums = ct.reduce_sum(tile, 1)
-        ct.store(b, pid, sums)
-        return
-    end
-
-    m, n = 64, 128
-    a = CUDA.rand(Float32, m, n)
-    b = CUDA.zeros(Float32, n)
-
-    ct.launch(reduce_sum_axis0_kernel, n, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for j in 1:n
-        @test b_cpu[j] ≈ cpu_reduce_add(a_cpu[:, j:j], 1)[1] rtol=1e-3
-    end
-end
-
-@testset "axis 0 reduce_min Int32" begin
-    function reduce_min_axis0_i32_kernel(a::ct.TileArray{Int32,2}, b::ct.TileArray{Int32,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (1, pid), (32, 1))
-        mins = ct.reduce_min(tile, 1)
-        ct.store(b, pid, mins)
-        return
-    end
-
-    m, n = 32, 64
-    a = CUDA.rand(Int32, m, n)
-    b = CUDA.fill(typemax(Int32), n)
-
-    ct.launch(reduce_min_axis0_i32_kernel, n, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for j in 1:n
-        @test b_cpu[j] == cpu_reduce_min(a_cpu[:, j:j], 1)[1]
-    end
-end
-
-@testset "axis 0 reduce_max UInt32" begin
-    function reduce_max_axis0_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (1, pid), (32, 1))
-        maxes = ct.reduce_max(tile, 1)
-        ct.store(b, pid, maxes)
-        return
-    end
-
-    m, n = 32, 64
-    a = CUDA.rand(UInt32, m, n)
-    b = CUDA.zeros(UInt32, n)
-
-    ct.launch(reduce_max_axis0_u32_kernel, n, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for j in 1:n
-        @test b_cpu[j] == cpu_reduce_max(a_cpu[:, j:j], 1)[1]
-    end
-end
-
-@testset "axis 0 reduce_and UInt32" begin
-    function reduce_and_axis0_u32_kernel(a::ct.TileArray{UInt32,2}, b::ct.TileArray{UInt32,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, (1, pid), (16, 1))
-        result = ct.reduce_and(tile, 1)
-        ct.store(b, pid, result)
-        return
-    end
-
-    m, n = 16, 32
-    a = CUDA.rand(UInt32, m, n)
-    b = CUDA.fill(typemax(UInt32), n)
-
-    ct.launch(reduce_and_axis0_u32_kernel, n, a, b)
-
-    a_cpu = Array(a)
-    b_cpu = Array(b)
-    for j in 1:n
-        @test b_cpu[j] == cpu_reduce_and(a_cpu[:, j:j], 1)[1]
-    end
-end
-
-end  # @testset "reduce operations"