From 32143e4423734957e6386ad5c95978a5377128df Mon Sep 17 00:00:00 2001 From: Charles Kawczynski Date: Wed, 23 Oct 2024 20:18:34 -0400 Subject: [PATCH] Ensure has_uniform_datalayouts for cuda copyto --- ext/cuda/data_layouts_copyto.jl | 68 +++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 20 deletions(-) diff --git a/ext/cuda/data_layouts_copyto.jl b/ext/cuda/data_layouts_copyto.jl index 8f8876fe52..8f14df0bc4 100644 --- a/ext/cuda/data_layouts_copyto.jl +++ b/ext/cuda/data_layouts_copyto.jl @@ -16,25 +16,18 @@ function knl_copyto_linear!(dest, src, us) return nothing end -function Base.copyto!(dest::AbstractData, bc, ::ToCUDA) - (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest) - us = DataLayouts.UniversalSize(dest) - if Nv > 0 && Nh > 0 - if !(VERSION ≥ v"1.11.0-beta") && dest isa DataLayouts.EndsWithField - bc′ = Base.Broadcast.instantiate( - DataLayouts.to_non_extruded_broadcasted(bc), - ) - args = (dest, bc′, us) - threads = threads_via_occupancy(knl_copyto_linear!, args) - n_max_threads = min(threads, get_N(us)) - p = linear_partition(prod(size(dest)), n_max_threads) - auto_launch!( - knl_copyto_linear!, - args; - threads_s = p.threads, - blocks_s = p.blocks, - ) - else +if VERSION ≥ v"1.11.0-beta" + # https://github.com/JuliaLang/julia/issues/56295 + # Julia 1.11's Base.Broadcast currently requires + # multiple integer indexing, whereas Julia 1.10 did not. + # This means that we cannot reserve linear indexing to + # special-case fixes for https://github.com/JuliaLang/julia/issues/28126 + # (including the GPU-variant related issue resolution efforts: + # JuliaGPU/GPUArrays.jl#454, JuliaGPU/GPUArrays.jl#464). 
+ function Base.copyto!(dest::AbstractData, bc, ::ToCUDA) + (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest) + us = DataLayouts.UniversalSize(dest) + if Nv > 0 && Nh > 0 args = (dest, bc, us) threads = threads_via_occupancy(knl_copyto!, args) n_max_threads = min(threads, get_N(us)) @@ -46,8 +39,43 @@ function Base.copyto!(dest::AbstractData, bc, ::ToCUDA) blocks_s = p.blocks, ) end + return dest + end +else + function Base.copyto!(dest::AbstractData, bc, ::ToCUDA) + (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest) + us = DataLayouts.UniversalSize(dest) + if Nv > 0 && Nh > 0 + if DataLayouts.has_uniform_datalayouts(bc) && + dest isa DataLayouts.EndsWithField + bc′ = Base.Broadcast.instantiate( + DataLayouts.to_non_extruded_broadcasted(bc), + ) + args = (dest, bc′, us) + threads = threads_via_occupancy(knl_copyto_linear!, args) + n_max_threads = min(threads, get_N(us)) + p = linear_partition(prod(size(dest)), n_max_threads) + auto_launch!( + knl_copyto_linear!, + args; + threads_s = p.threads, + blocks_s = p.blocks, + ) + else + args = (dest, bc, us) + threads = threads_via_occupancy(knl_copyto!, args) + n_max_threads = min(threads, get_N(us)) + p = partition(dest, n_max_threads) + auto_launch!( + knl_copyto!, + args; + threads_s = p.threads, + blocks_s = p.blocks, + ) + end + end + return dest end - return dest end # broadcasting scalar assignment