Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create MtlArray using memory allocated by Array #320

Merged
merged 3 commits into from
Apr 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 30 additions & 3 deletions lib/mtl/buffer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,17 +34,36 @@ function MTLBuffer(dev::Union{MTLDevice,MTLHeap}, bytesize::Integer;
end

function MTLBuffer(dev::MTLDevice, bytesize::Integer, ptr::Ptr;
storage=Managed, hazard_tracking=DefaultTracking,
nocopy=false, storage=Shared, hazard_tracking=DefaultTracking,
cache_mode=DefaultCPUCache)
storage == Private && error("Can't create a Private copy-allocated buffer.")
storage == Private && error("Cannot allocate-and-initialize a Private buffer")
opts = convert(MTLResourceOptions, storage) | hazard_tracking | cache_mode

@assert 0 < bytesize <= dev.maxBufferLength
ptr = alloc_buffer(dev, bytesize, opts, ptr)
ptr = if nocopy
alloc_buffer_nocopy(dev, bytesize, opts, ptr)
else
alloc_buffer(dev, bytesize, opts, ptr)
end

return MTLBuffer(ptr)
end

const PAGESIZE = ccall(:getpagesize, Cint, ())
function can_alloc_nocopy(ptr::Ptr, bytesize::Integer)
# newBufferWithBytesNoCopy has several restrictions:
## the pointer has to be page-aligned
if Int64(ptr) % PAGESIZE != 0
return false
end
## the new buffer needs to be page-aligned
## XXX: on macOS 14, this doesn't seem required; is this a documentation issue?
if bytesize % PAGESIZE != 0
return false
end
return true
end

# from device
alloc_buffer(dev::MTLDevice, bytesize, opts) =
@objc [dev::id{MTLDevice} newBufferWithLength:bytesize::NSUInteger
Expand All @@ -53,6 +72,14 @@ alloc_buffer(dev::MTLDevice, bytesize, opts, ptr::Ptr) =
@objc [dev::id{MTLDevice} newBufferWithBytes:ptr::Ptr{Cvoid}
length:bytesize::NSUInteger
options:opts::MTLResourceOptions]::id{MTLBuffer}
function alloc_buffer_nocopy(dev::MTLDevice, bytesize, opts, ptr::Ptr)
can_alloc_nocopy(ptr, bytesize) ||
throw(ArgumentError("Cannot allocate nocopy buffer from non-aligned memory"))
@objc [dev::id{MTLDevice} newBufferWithBytesNoCopy:ptr::Ptr{Cvoid}
length:bytesize::NSUInteger
options:opts::MTLResourceOptions
deallocator:nil::id{Object}]::id{MTLBuffer}
end

# from heap
alloc_buffer(dev::MTLHeap, bytesize, opts) =
Expand Down
16 changes: 16 additions & 0 deletions src/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,14 @@ mutable struct MtlArray{T,N,S} <: AbstractGPUArray{T,N}
end
end

# Create MtlArray from MTLBuffer
function MtlArray{T,N}(buf::B, dims::Dims{N}; kwargs...) where {B<:MTLBuffer,T,N}
data = DataRef(buf) do buf
free(buf)
end
return MtlArray{T,N}(data, dims; kwargs...)
end

unsafe_free!(a::MtlArray) = GPUArrays.unsafe_free!(a.data)

device(A::MtlArray) = A.data[].device
Expand Down Expand Up @@ -491,6 +499,14 @@ function Base.unsafe_wrap(t::Type{<:Array{T}}, ptr::MtlPtr{T}, dims; own=false)
return unsafe_wrap(t, convert(Ptr{T}, ptr), dims; own)
end

function Base.unsafe_wrap(A::Type{<:MtlArray{T,N}}, arr::Array, dims=size(arr);
dev=current_device(), kwargs...) where {T,N}
GC.@preserve arr begin
buf = MTLBuffer(dev, prod(dims) * sizeof(T), pointer(arr); nocopy=true, kwargs...)
return A(buf, Dims(dims))
end
end

## resizing

"""
Expand Down
31 changes: 21 additions & 10 deletions src/memory.jl
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,12 @@ function Base.unsafe_copyto!(dev::MTLDevice, dst::MtlPtr{T}, src::Ptr{T}, N::Int
storage_type = dst.buffer.storageMode
if storage_type == MTL.MTLStorageModePrivate
# stage through a shared buffer
# shared = alloc(dev, N*sizeof(T), src; storage=Shared)
# unsafe_copyto!(dev, dst, pointer(shared), N; queue, async=false)
# free(shared)
tmp_buf = alloc(dev, N*sizeof(T), src; storage=Shared) #CPU -> GPU (Shared)
unsafe_copyto!(dev, MtlPtr{T}(dst.buffer, dst.offset), MtlPtr{T}(tmp_buf, 0), N; queue, async=false) # GPU (Shared) -> GPU (Private)
nocopy = MTL.can_alloc_nocopy(src, N*sizeof(T))
tmp_buf = alloc(dev, N*sizeof(T), src; storage=Shared, nocopy)

# copy to the private buffer
unsafe_copyto!(dev, MtlPtr{T}(dst.buffer, dst.offset), MtlPtr{T}(tmp_buf, 0), N;
queue, async=(nocopy && async))
free(tmp_buf)
christiangnrd marked this conversation as resolved.
Show resolved Hide resolved
elseif storage_type == MTL.MTLStorageModeShared
unsafe_copyto!(convert(Ptr{T}, dst), src, N)
Expand All @@ -54,12 +55,22 @@ end
function Base.unsafe_copyto!(dev::MTLDevice, dst::Ptr{T}, src::MtlPtr{T}, N::Integer;
queue::MTLCommandQueue=global_queue(dev), async::Bool=false) where T
storage_type = src.buffer.storageMode
if storage_type == MTL.MTLStorageModePrivate
if storage_type == MTL.MTLStorageModePrivate
# stage through a shared buffer
shared = alloc(dev, N*sizeof(T); storage=Shared)
unsafe_copyto!(dev, MtlPtr{T}(shared, 0), MtlPtr{T}(src.buffer, src.offset), N; queue, async=false)
unsafe_copyto!(dst, convert(Ptr{T}, shared), N)
free(shared)
nocopy = MTL.can_alloc_nocopy(dst, N*sizeof(T))
tmp_buf = if nocopy
alloc(dev, N*sizeof(T), dst; storage=Shared, nocopy)
else
alloc(dev, N*sizeof(T); storage=Shared)
end
unsafe_copyto!(dev, MtlPtr{T}(tmp_buf, 0), MtlPtr{T}(src.buffer, src.offset), N;
queue, async=(nocopy && async))

# copy from the shared buffer
if !nocopy
unsafe_copyto!(dst, convert(Ptr{T}, tmp_buf), N)
end
free(tmp_buf)
elseif storage_type == MTL.MTLStorageModeShared
unsafe_copyto!(dst, convert(Ptr{T}, src), N)
elseif storage_type == MTL.MTLStorageModeManaged
Expand Down
48 changes: 48 additions & 0 deletions test/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -305,4 +305,52 @@ end
@test length(b) == 1
end

function _alignedvec(::Type{T}, n::Integer, alignment::Integer=16384) where {T}
ispow2(alignment) || throw(ArgumentError("$alignment is not a power of 2"))
alignment ≥ sizeof(Int) || throw(ArgumentError("$alignment is not a multiple of $(sizeof(Int))"))
isbitstype(T) || throw(ArgumentError("$T is not a bitstype"))
p = Ref{Ptr{T}}()
err = ccall(:posix_memalign, Cint, (Ref{Ptr{T}}, Csize_t, Csize_t), p, alignment, n*sizeof(T))
iszero(err) || throw(OutOfMemoryError())
return unsafe_wrap(Array, p[], n, own=true)
end

@testset "unsafe_wrap" begin
# Create page-aligned vector for testing
arr1 = _alignedvec(Float32, 16384*2);
fill!(arr1, zero(eltype(arr1)))
marr1 = unsafe_wrap(MtlVector{Float32}, arr1);

@test all(arr1 .== 0)
@test all(marr1 .== 0)

# XXX: Test fails when ordered as shown
# @test all(arr1 .== 1)
# @test all(marr1 .== 1)
marr1 .+= 1;
@test all(marr1 .== 1)
@test all(arr1 .== 1)

arr1 .+= 1;
@test all(marr1 .== 2)
@test all(arr1 .== 2)

marr2 = Metal.zeros(Float32, 18000; storage=Shared);
arr2 = unsafe_wrap(Vector{Float32}, marr2);

@test all(arr2 .== 0)
@test all(marr2 .== 0)

# XXX: Test fails when ordered as shown
# @test all(arr2 .== 1)
# @test all(marr2 .== 1)
marr2 .+= 1;
@test all(marr2 .== 1)
@test all(arr2 .== 1)

arr2 .+= 1;
@test all(arr2 .== 2)
@test all(marr2 .== 2)
end

end