Commit

Merge branch 'main' into unified-memory-linalg
christiangnrd authored Oct 1, 2024
2 parents eaa3283 + 438ab8f commit 9a7d39f
Showing 7 changed files with 118 additions and 102 deletions.
2 changes: 0 additions & 2 deletions .buildkite/pipeline.yml
@@ -118,9 +118,7 @@ steps:
           build.message !~ /\[skip special\]/
       timeout_in_minutes: 60
 
-  # we want to benchmark every commit on the master branch, even if it failed CI
   - wait: ~
-    # continue_on_failure: true
 
   - group: ":racehorse: Benchmarks"
     steps:
4 changes: 3 additions & 1 deletion .github/workflows/Benchmark.yml
@@ -11,6 +11,7 @@ on:
       - main
     paths:
       - "src/**/*"
+      - "lib/**/*"
       - "ext/**/*"
       - "perf/**/*"
       - ".buildkite/**/*"
@@ -21,8 +22,9 @@ on:
       - main
     paths:
       - "src/**/*"
+      - "lib/**/*"
       - "ext/**/*"
-      - "benchmarks/**/*"
       - "perf/**/*"
       - ".buildkite/**/*"
       - "Project.toml"
+      - ".github/workflows/Benchmark.yml"
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,6 +1,6 @@
 name = "Metal"
 uuid = "dde4c033-4e86-420c-a63e-0dd931031962"
-version = "1.3.0"
+version = "1.4.0"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
6 changes: 3 additions & 3 deletions lib/mps/linalg.jl
@@ -128,7 +128,7 @@ LinearAlgebra.ipiv2perm(v::MtlVector{T,S}, maxi::Integer) where {T,S<:MTL.CPUSto
         encode!(cbuf, kernel, descriptor)
     end
 
-    P = MtlMatrix{UInt32}(undef, 1, min(N, M))
+    P = similar(A, UInt32, 1, min(N, M))
     status = MtlArray{MPSMatrixDecompositionStatus,0,SharedStorage}(undef)
 
     commitAndContinue!(cmdbuf) do cbuf
@@ -137,7 +137,7 @@ LinearAlgebra.ipiv2perm(v::MtlVector{T,S}, maxi::Integer) where {T,S<:MTL.CPUSto
         encode!(cbuf, kernel, mps_at, mps_at, mps_p, status)
     end
 
-    B = MtlMatrix{T}(undef, M, N)
+    B = similar(A, M, N)
 
     commit!(cmdbuf) do cbuf
         mps_b = MPSMatrix(B)
@@ -186,7 +186,7 @@ end
         encode!(cbuf, kernel, descriptor)
     end
 
-    P = MtlMatrix{UInt32}(undef, 1, min(N, M))
+    P = similar(A, UInt32, 1, min(N, M))
     status = MtlArray{MPSMatrixDecompositionStatus,0,SharedStorage}(undef)
 
     commitAndContinue!(cmdbuf) do cbuf
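The lib/mps/linalg.jl change swaps the hard-coded MtlMatrix constructors for `similar(A, ...)`, so the LU outputs pick up the input's storage mode instead of always using the package default. A minimal sketch of the difference, assuming Metal.jl and a Metal-capable GPU (the sizes are illustrative):

using Metal
using Metal: storagemode

A = MtlMatrix{Float32,Metal.SharedStorage}(undef, 8, 8)

# a hard-coded constructor always uses the default storage mode
P_default = MtlMatrix{UInt32}(undef, 1, 8)

# `similar` derives the buffer from A, so the storage mode follows the input
P_similar = similar(A, UInt32, 1, 8)

storagemode(P_similar) == storagemode(A)   # true
storagemode(P_default) == storagemode(A)   # true only if A already uses the default mode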
176 changes: 89 additions & 87 deletions perf/array.jl
@@ -1,110 +1,112 @@
-group = addgroup!(SUITE, "array")
-
 const m = 512
 const n = 1000
 
-# generate some arrays
-cpu_mat = rand(rng, Float32, m, n)
-gpu_mat = MtlArray{Float32}(undef, size(cpu_mat))
-gpu_vec = reshape(gpu_mat, length(gpu_mat))
-gpu_arr_3d = reshape(gpu_mat, (m, 40, 25))
-gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10))
-gpu_mat_ints = MtlArray(rand(rng, Int, m, n))
-gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
-gpu_mat_bools = MtlArray(rand(rng, Bool, m, n))
-gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))
-
-group["construct"] = @benchmarkable MtlArray{Int}(undef, 1)
-
-group["copy"] = @async_benchmarkable copy($gpu_mat)
-
-gpu_mat2 = copy(gpu_mat)
-let group = addgroup!(group, "copyto!")
-    group["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat)
-    group["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat)
-    group["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat)
-end
+for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shared")]
+    group = addgroup!(SUITE, "$smname array")
+
+    # generate some arrays
+    cpu_mat = rand(rng, Float32, m, n)
+    gpu_mat = MtlMatrix{Float32,S}(undef, size(cpu_mat))
+    gpu_vec = reshape(gpu_mat, length(gpu_mat))
+    gpu_arr_3d = reshape(gpu_mat, (m, 40, 25))
+    gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10))
+    gpu_mat_ints = MtlMatrix{Int,S}(rand(rng, Int, m, n))
+    gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
+    gpu_mat_bools = MtlMatrix{Bool,S}(rand(rng, Bool, m, n))
+    gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))
+
+    group["construct"] = @benchmarkable MtlArray{Int,1,$S}(undef, 1)
+
+    group["copy"] = @benchmarkable Metal.@sync copy($gpu_mat)
+
+    gpu_mat2 = copy(gpu_mat)
+    let group = addgroup!(group, "copyto!")
+        group["cpu_to_gpu"] = @benchmarkable Metal.@sync copyto!($gpu_mat, $cpu_mat)
+        group["gpu_to_cpu"] = @benchmarkable Metal.@sync copyto!($cpu_mat, $gpu_mat)
+        group["gpu_to_gpu"] = @benchmarkable Metal.@sync copyto!($gpu_mat2, $gpu_mat)
+    end
 
-let group = addgroup!(group, "iteration")
-    group["scalar"] = @benchmarkable Metal.@allowscalar [$gpu_vec[i] for i in 1:10]
+    let group = addgroup!(group, "iteration")
+        group["scalar"] = @benchmarkable Metal.@allowscalar [$gpu_vec[i] for i in 1:10]
 
-    group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools]
+        group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools]
 
-    let group = addgroup!(group, "findall")
-        group["bool"] = @benchmarkable findall($gpu_vec_bools)
-        group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints)
-    end
+        let group = addgroup!(group, "findall")
+            group["bool"] = @benchmarkable findall($gpu_vec_bools)
+            group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints)
+        end
 
-    let group = addgroup!(group, "findfirst")
-        group["bool"] = @benchmarkable findfirst($gpu_vec_bools)
-        group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints)
-    end
+        let group = addgroup!(group, "findfirst")
+            group["bool"] = @benchmarkable findfirst($gpu_vec_bools)
+            group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints)
+        end
 
-    let group = addgroup!(group, "findmin") # findmax
-        group["1d"] = @async_benchmarkable findmin($gpu_vec)
-        group["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1)
+        let group = addgroup!(group, "findmin") # findmax
+            group["1d"] = @benchmarkable Metal.@sync findmin($gpu_vec)
+            group["2d"] = @benchmarkable Metal.@sync findmin($gpu_mat; dims=1)
+        end
     end
-end
 
-# let group = addgroup!(group, "reverse")
-#     group["1d"] = @async_benchmarkable reverse($gpu_vec)
-#     group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1)
-#     group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec)
-#     group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1)
-# end
-
-group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0
+    # let group = addgroup!(group, "reverse")
+    #     group["1d"] = @benchmarkable Metal.@sync reverse($gpu_vec)
+    #     group["2d"] = @benchmarkable Metal.@sync reverse($gpu_mat; dims=1)
+    #     group["1d_inplace"] = @benchmarkable Metal.@sync reverse!($gpu_vec)
+    #     group["2d_inplace"] = @benchmarkable Metal.@sync reverse!($gpu_mat; dims=1)
+    # end
 
-# no need to test inplace version, which performs the same operation (but with an alloc)
-let group = addgroup!(group, "accumulate")
-    group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec)
-    group["2d"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1)
-end
+    group["broadcast"] = @benchmarkable Metal.@sync $gpu_mat .= 0f0
 
-let group = addgroup!(group, "reductions")
-    let group = addgroup!(group, "reduce")
-        group["1d"] = @async_benchmarkable reduce(+, $gpu_vec)
-        group["2d"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1)
+    # no need to test inplace version, which performs the same operation (but with an alloc)
+    let group = addgroup!(group, "accumulate")
+        group["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec)
+        group["2d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=1)
     end
 
-    let group = addgroup!(group, "mapreduce")
-        group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec)
-        group["2d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1)
-    end
+    let group = addgroup!(group, "reductions")
+        let group = addgroup!(group, "reduce")
+            group["1d"] = @benchmarkable Metal.@sync reduce(+, $gpu_vec)
+            group["2d"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=1)
+        end
 
-    # used by sum, prod, minimum, maximum, all, any, count
-end
+        let group = addgroup!(group, "mapreduce")
+            group["1d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_vec)
+            group["2d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=1)
+        end
 
-let group = addgroup!(group, "random")
-    let group = addgroup!(group, "rand")
-        group["Float32"] = @async_benchmarkable Metal.rand(Float32, m*n)
-        group["Int64"] = @async_benchmarkable Metal.rand(Int64, m*n)
+        # used by sum, prod, minimum, maximum, all, any, count
     end
 
-    let group = addgroup!(group, "rand!")
-        group["Float32"] = @async_benchmarkable Metal.rand!($gpu_vec)
-        group["Int64"] = @async_benchmarkable Metal.rand!($gpu_vec_ints)
+    let group = addgroup!(group, "random")
+        let group = addgroup!(group, "rand")
+            group["Float32"] = @benchmarkable Metal.@sync Metal.rand(Float32, m*n)
+            group["Int64"] = @benchmarkable Metal.@sync Metal.rand(Int64, m*n)
+        end
+
+        let group = addgroup!(group, "rand!")
+            group["Float32"] = @benchmarkable Metal.@sync Metal.rand!($gpu_vec)
+            group["Int64"] = @benchmarkable Metal.@sync Metal.rand!($gpu_vec_ints)
+        end
+
+        let group = addgroup!(group, "randn")
+            group["Float32"] = @benchmarkable Metal.@sync Metal.randn(Float32, m*n)
+            # group["Int64"] = @benchmarkable Metal.@sync Metal.randn(Int64, m*n)
+        end
+
+        let group = addgroup!(group, "randn!")
+            group["Float32"] = @benchmarkable Metal.@sync Metal.randn!($gpu_vec)
+            # group["Int64"] = @benchmarkable Metal.@sync Metal.randn!($gpu_vec_ints)
+        end
     end
 
-    let group = addgroup!(group, "randn")
-        group["Float32"] = @async_benchmarkable Metal.randn(Float32, m*n)
-        # group["Int64"] = @async_benchmarkable Metal.randn(Int64, m*n)
-    end
+    # let group = addgroup!(group, "sorting")
+    #     group["1d"] = @benchmarkable Metal.@sync sort($gpu_vec)
+    #     group["2d"] = @benchmarkable Metal.@sync sort($gpu_mat; dims=1)
+    #     group["by"] = @benchmarkable Metal.@sync sort($gpu_vec; by=sin)
+    # end
 
-    let group = addgroup!(group, "randn!")
-        group["Float32"] = @async_benchmarkable Metal.randn!($gpu_vec)
-        # group["Int64"] = @async_benchmarkable Metal.randn!($gpu_vec_ints)
+    let group = addgroup!(group, "permutedims")
+        group["2d"] = @benchmarkable Metal.@sync permutedims($gpu_mat, (2,1))
+        group["3d"] = @benchmarkable Metal.@sync permutedims($gpu_arr_3d, (3,1,2))
+        group["4d"] = @benchmarkable Metal.@sync permutedims($gpu_arr_4d, (2,1,4,3))
     end
 end
-
-# let group = addgroup!(group, "sorting")
-#     group["1d"] = @async_benchmarkable sort($gpu_vec)
-#     group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
-#     group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
-# end
-
-let group = addgroup!(group, "permutedims")
-    group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1))
-    group["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2))
-    group["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3))
-end
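The rewritten perf/array.jl wraps the whole suite in a loop over the two storage modes and replaces `@async_benchmarkable` with explicit `Metal.@sync` timing. A rough, self-contained sketch of that pattern, assuming BenchmarkTools.jl is installed (the SUITE/rng setup here is illustrative; the real harness defines its own):

using BenchmarkTools, Metal, Random

SUITE = BenchmarkGroup()
rng = Xoshiro(0)   # illustrative RNG; the perf harness configures its own rng

for (S, smname) in [(Metal.PrivateStorage, "private"), (Metal.SharedStorage, "shared")]
    group = addgroup!(SUITE, "$smname array")

    # a storage-mode-specific GPU matrix to benchmark against
    gpu_mat = MtlMatrix{Float32,S}(rand(rng, Float32, 512, 1000))

    # Metal.@sync waits for the GPU to finish, so the whole operation is timed
    group["broadcast"] = @benchmarkable Metal.@sync $gpu_mat .= 0f0
end

results = run(SUITE)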
22 changes: 14 additions & 8 deletions test/array.jl
@@ -210,28 +210,34 @@ end
 
     # Dims in tuple
     let A = Metal.fill(b, (10, 10, 10, 1000))
-        @test all(Array(A) .== b)
+        B = fill(b, (10, 10, 10, 1000))
+        @test Array(A) == B
     end
 
-    let M = Metal.fill(b, (10, 10))
-        @test all(Array(M) .== b)
+    let M = Metal.fill(b, (10, 10, 10, 1000))
+        B = fill(b, (10, 10, 10, 1000))
+        @test Array(M) == B
     end
 
     let V = Metal.fill(b, (10,))
-        @test all(Array(V) .== b)
+        B = fill(b, (10,))
+        @test Array(V) == B
     end
 
     #Dims already unpacked
     let A = Metal.fill(b, 10, 10, 10, 1000)
-        @test all(Array(A) .== b)
+        B = fill(b, 10, 10, 10, 1000)
+        @test Array(A) == B
     end
 
     let M = Metal.fill(b, 10, 10)
-        @test all(Array(M) .== b)
+        B = fill(b, 10, 10)
+        @test Array(M) == B
     end
 
     let V = Metal.fill(b, 10)
-        @test all(Array(V) .== b)
+        B = fill(b, 10)
+        @test Array(V) == B
     end
 end
 
@@ -420,7 +426,7 @@ end

 @testset "broadcast" begin
     testf(f, x) = Array(f(MtlArray(x))) ≈ f(x)
 
     @test testf(x->max.(x, zero(Float32)), randn(Float32, 1000))
     @test testf(x->min.(x, one(Float32)), randn(Float32, 1000))
     @test testf(x->min.(max.(x, zero(Float32)), one(Float32)), randn(Float32, 1000))
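The updated fill tests compare against a CPU reference built with Base's `fill` instead of broadcasting `.== b`. A small sketch of that pattern (assumes a Metal GPU; the value and dims are illustrative):

using Metal

b = 3f0
A = Metal.fill(b, (10, 10))   # GPU-side fill
B = fill(b, (10, 10))         # CPU reference
Array(A) == B                 # exact equality against the reference, as in the updated tests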
8 changes: 8 additions & 0 deletions test/mps/linalg.jl
@@ -190,6 +190,7 @@ end
     end
 end
 
+using Metal: storagemode
 @testset "decompositions" begin
     A = MtlMatrix(rand(Float32, 1024, 1024))
     lua = lu(A)
@@ -211,6 +212,13 @@
 
     A = MtlMatrix{Float32}([1 2; 0 0])
     @test_throws SingularException lu(A)
+
+    altStorage = Metal.DefaultStorageMode != Metal.PrivateStorage ? Metal.PrivateStorage : Metal.SharedStorage
+    A = MtlMatrix{Float32,altStorage}(rand(Float32, 1024, 1024))
+    lua = lu(A)
+    @test storagemode(lua.factors) == storagemode(lua.ipiv) == storagemode(A)
+    lua = lu!(A)
+    @test storagemode(lua.factors) == storagemode(lua.ipiv) == storagemode(A)
 end
 
 using .MPS: MPSMatrixSoftMax, MPSMatrixLogSoftMax
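The new decomposition test runs LU on a matrix allocated in the non-default storage mode and checks that the factorization buffers stay in that mode. A hedged sketch of the same check, assuming a Metal GPU (the 1024×1024 size mirrors the test):

using LinearAlgebra, Metal
using Metal: storagemode

# pick whichever storage mode is not the package default, as the test does
altStorage = Metal.DefaultStorageMode != Metal.PrivateStorage ? Metal.PrivateStorage : Metal.SharedStorage

A = MtlMatrix{Float32,altStorage}(rand(Float32, 1024, 1024))
F = lu(A)   # MPS-backed LU factorization

# the factors and pivots are expected to inherit A's storage mode
@assert storagemode(F.factors) == storagemode(A)
@assert storagemode(F.ipiv) == storagemode(A)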
