Commit

Merge branch 'main' into unified-memory-linalg
christiangnrd authored Oct 1, 2024
2 parents eaa3283 + 438ab8f commit 9a7d39f
Showing 7 changed files with 118 additions and 102 deletions.
2 changes: 0 additions & 2 deletions .buildkite/pipeline.yml
@@ -118,9 +118,7 @@ steps:
           build.message !~ /\[skip special\]/
       timeout_in_minutes: 60
 
-  # we want to benchmark every commit on the master branch, even if it failed CI
   - wait: ~
-    # continue_on_failure: true
 
   - group: ":racehorse: Benchmarks"
     steps:
4 changes: 3 additions & 1 deletion .github/workflows/Benchmark.yml
@@ -11,6 +11,7 @@ on:
       - main
     paths:
       - "src/**/*"
+      - "lib/**/*"
       - "ext/**/*"
       - "perf/**/*"
       - ".buildkite/**/*"
@@ -21,8 +22,9 @@ on:
       - main
     paths:
       - "src/**/*"
+      - "lib/**/*"
       - "ext/**/*"
-      - "benchmarks/**/*"
       - "perf/**/*"
       - ".buildkite/**/*"
       - "Project.toml"
+      - ".github/workflows/Benchmark.yml"
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,6 +1,6 @@
 name = "Metal"
 uuid = "dde4c033-4e86-420c-a63e-0dd931031962"
-version = "1.3.0"
+version = "1.4.0"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
6 changes: 3 additions & 3 deletions lib/mps/linalg.jl
@@ -128,7 +128,7 @@ LinearAlgebra.ipiv2perm(v::MtlVector{T,S}, maxi::Integer) where {T,S<:MTL.CPUSto
         encode!(cbuf, kernel, descriptor)
     end
 
-    P = MtlMatrix{UInt32}(undef, 1, min(N, M))
+    P = similar(A, UInt32, 1, min(N, M))
     status = MtlArray{MPSMatrixDecompositionStatus,0,SharedStorage}(undef)
 
     commitAndContinue!(cmdbuf) do cbuf
@@ -137,7 +137,7 @@ LinearAlgebra.ipiv2perm(v::MtlVector{T,S}, maxi::Integer) where {T,S<:MTL.CPUSto
         encode!(cbuf, kernel, mps_at, mps_at, mps_p, status)
     end
 
-    B = MtlMatrix{T}(undef, M, N)
+    B = similar(A, M, N)
 
     commit!(cmdbuf) do cbuf
         mps_b = MPSMatrix(B)
@@ -186,7 +186,7 @@ end
         encode!(cbuf, kernel, descriptor)
     end
 
-    P = MtlMatrix{UInt32}(undef, 1, min(N, M))
+    P = similar(A, UInt32, 1, min(N, M))
     status = MtlArray{MPSMatrixDecompositionStatus,0,SharedStorage}(undef)
 
     commitAndContinue!(cmdbuf) do cbuf
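The lib/mps/linalg.jl change swaps the hard-coded MtlMatrix constructors for `similar(A, ...)`, so the LU outputs pick up the input's storage mode instead of always using the package default. A minimal sketch of the difference, assuming Metal.jl and a Metal-capable GPU (the sizes are illustrative):

using Metal
using Metal: storagemode

A = MtlMatrix{Float32,Metal.SharedStorage}(undef, 8, 8)

# a hard-coded constructor always uses the default storage mode
P_default = MtlMatrix{UInt32}(undef, 1, 8)

# `similar` derives the buffer from A, so the storage mode follows the input
P_similar = similar(A, UInt32, 1, 8)

storagemode(P_similar) == storagemode(A)   # true
storagemode(P_default) == storagemode(A)   # true only if A already uses the default mode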
176 changes: 89 additions & 87 deletions perf/array.jl
@@ -1,110 +1,112 @@
-group = addgroup!(SUITE, "array")
-
 const m = 512
 const n = 1000
 
-# generate some arrays
-cpu_mat = rand(rng, Float32, m, n)
-gpu_mat = MtlArray{Float32}(undef, size(cpu_mat))
-gpu_vec = reshape(gpu_mat, length(gpu_mat))
-gpu_arr_3d = reshape(gpu_mat, (m, 40, 25))
-gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10))
-gpu_mat_ints = MtlArray(rand(rng, Int, m, n))
-gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
-gpu_mat_bools = MtlArray(rand(rng, Bool, m, n))
-gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))
-
-group["construct"] = @benchmarkable MtlArray{Int}(undef, 1)
-
-group["copy"] = @async_benchmarkable copy($gpu_mat)
-
-gpu_mat2 = copy(gpu_mat)
-let group = addgroup!(group, "copyto!")
-    group["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat)
-    group["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat)
-    group["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat)
-end
+for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shared")]
+    group = addgroup!(SUITE, "$smname array")
+
+    # generate some arrays
+    cpu_mat = rand(rng, Float32, m, n)
+    gpu_mat = MtlMatrix{Float32,S}(undef, size(cpu_mat))
+    gpu_vec = reshape(gpu_mat, length(gpu_mat))
+    gpu_arr_3d = reshape(gpu_mat, (m, 40, 25))
+    gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10))
+    gpu_mat_ints = MtlMatrix{Int,S}(rand(rng, Int, m, n))
+    gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
+    gpu_mat_bools = MtlMatrix{Bool,S}(rand(rng, Bool, m, n))
+    gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))
+
+    group["construct"] = @benchmarkable MtlArray{Int,1,$S}(undef, 1)
+
+    group["copy"] = @benchmarkable Metal.@sync copy($gpu_mat)
+
+    gpu_mat2 = copy(gpu_mat)
+    let group = addgroup!(group, "copyto!")
+        group["cpu_to_gpu"] = @benchmarkable Metal.@sync copyto!($gpu_mat, $cpu_mat)
+        group["gpu_to_cpu"] = @benchmarkable Metal.@sync copyto!($cpu_mat, $gpu_mat)
+        group["gpu_to_gpu"] = @benchmarkable Metal.@sync copyto!($gpu_mat2, $gpu_mat)
+    end
 
-let group = addgroup!(group, "iteration")
-    group["scalar"] = @benchmarkable Metal.@allowscalar [$gpu_vec[i] for i in 1:10]
+    let group = addgroup!(group, "iteration")
+        group["scalar"] = @benchmarkable Metal.@allowscalar [$gpu_vec[i] for i in 1:10]
 
-    group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools]
+        group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools]
 
-    let group = addgroup!(group, "findall")
-        group["bool"] = @benchmarkable findall($gpu_vec_bools)
-        group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints)
-    end
+        let group = addgroup!(group, "findall")
+            group["bool"] = @benchmarkable findall($gpu_vec_bools)
+            group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints)
+        end
 
-    let group = addgroup!(group, "findfirst")
-        group["bool"] = @benchmarkable findfirst($gpu_vec_bools)
-        group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints)
-    end
+        let group = addgroup!(group, "findfirst")
+            group["bool"] = @benchmarkable findfirst($gpu_vec_bools)
+            group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints)
+        end
 
-    let group = addgroup!(group, "findmin") # findmax
-        group["1d"] = @async_benchmarkable findmin($gpu_vec)
-        group["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1)
+        let group = addgroup!(group, "findmin") # findmax
+            group["1d"] = @benchmarkable Metal.@sync findmin($gpu_vec)
+            group["2d"] = @benchmarkable Metal.@sync findmin($gpu_mat; dims=1)
+        end
     end
-end
 
-# let group = addgroup!(group, "reverse")
-#     group["1d"] = @async_benchmarkable reverse($gpu_vec)
-#     group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1)
-#     group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec)
-#     group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1)
-# end
-
-group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0
+    # let group = addgroup!(group, "reverse")
+    #     group["1d"] = @benchmarkable Metal.@sync reverse($gpu_vec)
+    #     group["2d"] = @benchmarkable Metal.@sync reverse($gpu_mat; dims=1)
+    #     group["1d_inplace"] = @benchmarkable Metal.@sync reverse!($gpu_vec)
+    #     group["2d_inplace"] = @benchmarkable Metal.@sync reverse!($gpu_mat; dims=1)
+    # end
 
-# no need to test inplace version, which performs the same operation (but with an alloc)
-let group = addgroup!(group, "accumulate")
-    group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec)
-    group["2d"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1)
-end
+    group["broadcast"] = @benchmarkable Metal.@sync $gpu_mat .= 0f0
 
-let group = addgroup!(group, "reductions")
-    let group = addgroup!(group, "reduce")
-        group["1d"] = @async_benchmarkable reduce(+, $gpu_vec)
-        group["2d"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1)
+    # no need to test inplace version, which performs the same operation (but with an alloc)
+    let group = addgroup!(group, "accumulate")
+        group["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec)
+        group["2d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=1)
     end
 
-    let group = addgroup!(group, "mapreduce")
-        group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec)
-        group["2d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1)
-    end
+    let group = addgroup!(group, "reductions")
+        let group = addgroup!(group, "reduce")
+            group["1d"] = @benchmarkable Metal.@sync reduce(+, $gpu_vec)
+            group["2d"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=1)
+        end
 
-    # used by sum, prod, minimum, maximum, all, any, count
-end
+        let group = addgroup!(group, "mapreduce")
+            group["1d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_vec)
+            group["2d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=1)
+        end
 
-let group = addgroup!(group, "random")
-    let group = addgroup!(group, "rand")
-        group["Float32"] = @async_benchmarkable Metal.rand(Float32, m*n)
-        group["Int64"] = @async_benchmarkable Metal.rand(Int64, m*n)
+        # used by sum, prod, minimum, maximum, all, any, count
     end
 
-    let group = addgroup!(group, "rand!")
-        group["Float32"] = @async_benchmarkable Metal.rand!($gpu_vec)
-        group["Int64"] = @async_benchmarkable Metal.rand!($gpu_vec_ints)
+    let group = addgroup!(group, "random")
+        let group = addgroup!(group, "rand")
+            group["Float32"] = @benchmarkable Metal.@sync Metal.rand(Float32, m*n)
+            group["Int64"] = @benchmarkable Metal.@sync Metal.rand(Int64, m*n)
+        end
+
+        let group = addgroup!(group, "rand!")
+            group["Float32"] = @benchmarkable Metal.@sync Metal.rand!($gpu_vec)
+            group["Int64"] = @benchmarkable Metal.@sync Metal.rand!($gpu_vec_ints)
+        end
+
+        let group = addgroup!(group, "randn")
+            group["Float32"] = @benchmarkable Metal.@sync Metal.randn(Float32, m*n)
+            # group["Int64"] = @benchmarkable Metal.@sync Metal.randn(Int64, m*n)
+        end
+
+        let group = addgroup!(group, "randn!")
+            group["Float32"] = @benchmarkable Metal.@sync Metal.randn!($gpu_vec)
+            # group["Int64"] = @benchmarkable Metal.@sync Metal.randn!($gpu_vec_ints)
+        end
     end
 
-    let group = addgroup!(group, "randn")
-        group["Float32"] = @async_benchmarkable Metal.randn(Float32, m*n)
-        # group["Int64"] = @async_benchmarkable Metal.randn(Int64, m*n)
-    end
+    # let group = addgroup!(group, "sorting")
+    #     group["1d"] = @benchmarkable Metal.@sync sort($gpu_vec)
+    #     group["2d"] = @benchmarkable Metal.@sync sort($gpu_mat; dims=1)
+    #     group["by"] = @benchmarkable Metal.@sync sort($gpu_vec; by=sin)
+    # end
 
-    let group = addgroup!(group, "randn!")
-        group["Float32"] = @async_benchmarkable Metal.randn!($gpu_vec)
-        # group["Int64"] = @async_benchmarkable Metal.randn!($gpu_vec_ints)
+    let group = addgroup!(group, "permutedims")
+        group["2d"] = @benchmarkable Metal.@sync permutedims($gpu_mat, (2,1))
+        group["3d"] = @benchmarkable Metal.@sync permutedims($gpu_arr_3d, (3,1,2))
+        group["4d"] = @benchmarkable Metal.@sync permutedims($gpu_arr_4d, (2,1,4,3))
     end
 end
-
-# let group = addgroup!(group, "sorting")
-#     group["1d"] = @async_benchmarkable sort($gpu_vec)
-#     group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
-#     group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
-# end
-
-let group = addgroup!(group, "permutedims")
-    group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1))
-    group["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2))
-    group["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3))
-end
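The rewritten perf/array.jl wraps the whole suite in a loop over the two storage modes and replaces `@async_benchmarkable` with explicit `Metal.@sync` timing. A rough, self-contained sketch of that pattern, assuming BenchmarkTools.jl is installed (the SUITE/rng setup here is illustrative; the real harness defines its own):

using BenchmarkTools, Metal, Random

SUITE = BenchmarkGroup()
rng = Xoshiro(0)   # illustrative RNG; the perf harness configures its own rng

for (S, smname) in [(Metal.PrivateStorage, "private"), (Metal.SharedStorage, "shared")]
    group = addgroup!(SUITE, "$smname array")

    # a storage-mode-specific GPU matrix to benchmark against
    gpu_mat = MtlMatrix{Float32,S}(rand(rng, Float32, 512, 1000))

    # Metal.@sync waits for the GPU to finish, so the whole operation is timed
    group["broadcast"] = @benchmarkable Metal.@sync $gpu_mat .= 0f0
end

results = run(SUITE)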
22 changes: 14 additions & 8 deletions test/array.jl
@@ -210,28 +210,34 @@ end
 
     # Dims in tuple
     let A = Metal.fill(b, (10, 10, 10, 1000))
-        @test all(Array(A) .== b)
+        B = fill(b, (10, 10, 10, 1000))
+        @test Array(A) == B
     end
 
-    let M = Metal.fill(b, (10, 10))
-        @test all(Array(M) .== b)
+    let M = Metal.fill(b, (10, 10, 10, 1000))
+        B = fill(b, (10, 10, 10, 1000))
+        @test Array(M) == B
     end
 
     let V = Metal.fill(b, (10,))
-        @test all(Array(V) .== b)
+        B = fill(b, (10,))
+        @test Array(V) == B
     end
 
     #Dims already unpacked
     let A = Metal.fill(b, 10, 10, 10, 1000)
-        @test all(Array(A) .== b)
+        B = fill(b, 10, 10, 10, 1000)
+        @test Array(A) == B
     end
 
     let M = Metal.fill(b, 10, 10)
-        @test all(Array(M) .== b)
+        B = fill(b, 10, 10)
+        @test Array(M) == B
     end
 
     let V = Metal.fill(b, 10)
-        @test all(Array(V) .== b)
+        B = fill(b, 10)
+        @test Array(V) == B
     end
 end
 
@@ -420,7 +426,7 @@ end

 @testset "broadcast" begin
     testf(f, x) = Array(f(MtlArray(x))) ≈ f(x)
 
     @test testf(x->max.(x, zero(Float32)), randn(Float32, 1000))
     @test testf(x->min.(x, one(Float32)), randn(Float32, 1000))
     @test testf(x->min.(max.(x, zero(Float32)), one(Float32)), randn(Float32, 1000))
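The updated fill tests compare against a CPU reference built with Base's `fill` instead of broadcasting `.== b`. A small sketch of that pattern (assumes a Metal GPU; the value and dims are illustrative):

using Metal

b = 3f0
A = Metal.fill(b, (10, 10))   # GPU-side fill
B = fill(b, (10, 10))         # CPU reference
Array(A) == B                 # exact equality against the reference, as in the updated tests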
8 changes: 8 additions & 0 deletions test/mps/linalg.jl
@@ -190,6 +190,7 @@ end
     end
 end
 
+using Metal: storagemode
 @testset "decompositions" begin
     A = MtlMatrix(rand(Float32, 1024, 1024))
     lua = lu(A)
@@ -211,6 +212,13 @@
 
     A = MtlMatrix{Float32}([1 2; 0 0])
     @test_throws SingularException lu(A)
+
+    altStorage = Metal.DefaultStorageMode != Metal.PrivateStorage ? Metal.PrivateStorage : Metal.SharedStorage
+    A = MtlMatrix{Float32,altStorage}(rand(Float32, 1024, 1024))
+    lua = lu(A)
+    @test storagemode(lua.factors) == storagemode(lua.ipiv) == storagemode(A)
+    lua = lu!(A)
+    @test storagemode(lua.factors) == storagemode(lua.ipiv) == storagemode(A)
 end
 
 using .MPS: MPSMatrixSoftMax, MPSMatrixLogSoftMax
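The new decomposition test runs LU on a matrix allocated in the non-default storage mode and checks that the factorization buffers stay in that mode. A hedged sketch of the same check, assuming a Metal GPU (the 1024×1024 size mirrors the test):

using LinearAlgebra, Metal
using Metal: storagemode

# pick whichever storage mode is not the package default, as the test does
altStorage = Metal.DefaultStorageMode != Metal.PrivateStorage ? Metal.PrivateStorage : Metal.SharedStorage

A = MtlMatrix{Float32,altStorage}(rand(Float32, 1024, 1024))
F = lu(A)   # MPS-backed LU factorization

# the factors and pivots are expected to inherit A's storage mode
@assert storagemode(F.factors) == storagemode(A)
@assert storagemode(F.ipiv) == storagemode(A)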
