From 63379de799960ec2a0af62771c0d82adadda6ae8 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 26 Sep 2024 10:38:08 -0300 Subject: [PATCH 1/5] Move BFloat16 code out of extension --- Project.toml | 3 +-- ext/BFloat16sExt.jl | 14 -------------- lib/mps/MPS.jl | 2 ++ lib/mps/matrix.jl | 2 +- 4 files changed, 4 insertions(+), 17 deletions(-) delete mode 100644 ext/BFloat16sExt.jl diff --git a/Project.toml b/Project.toml index 4e7f1bd88..90cb06fb4 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "1.4.0" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" +BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82" CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd" ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" @@ -26,11 +27,9 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [weakdeps] -BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" [extensions] -BFloat16sExt = "BFloat16s" SpecialFunctionsExt = "SpecialFunctions" [compat] diff --git a/ext/BFloat16sExt.jl b/ext/BFloat16sExt.jl deleted file mode 100644 index 6f11b18fc..000000000 --- a/ext/BFloat16sExt.jl +++ /dev/null @@ -1,14 +0,0 @@ -module BFloat16sExt - -using Metal: MPS.MPSDataType, MPS.MPSDataTypeBFloat16, MPS.jl_mps_to_typ, macos_version -using BFloat16s - -# BFloat is only supported in MPS starting in MacOS 14 -@static if Sys.isapple() - if macos_version() >= v"14" - Base.convert(::Type{MPSDataType}, ::Type{BFloat16}) = MPSDataTypeBFloat16 - jl_mps_to_typ[MPSDataTypeBFloat16] = BFloat16 - end -end - -end # module diff --git a/lib/mps/MPS.jl b/lib/mps/MPS.jl index 7266eae9a..09f7754b7 100644 --- a/lib/mps/MPS.jl +++ b/lib/mps/MPS.jl @@ -16,6 +16,8 @@ using ObjectiveC, .Foundation import GPUArrays +using BFloat16s + const MtlFloat = Union{Float32, Float16} is_supported(dev::MTLDevice) = ccall(:MPSSupportsMTLDevice, Bool, (id{MTLDevice},), dev) diff --git a/lib/mps/matrix.jl b/lib/mps/matrix.jl index 9eb86d75b..295d63c53 100644 --- a/lib/mps/matrix.jl +++ b/lib/mps/matrix.jl @@ -38,7 +38,7 @@ Base.convert(::Type{MPSDataType}, x::Integer) = MPSDataType(x) # Conversions for MPSDataTypes with Julia equivalents const jl_mps_to_typ = Dict{MPSDataType, DataType}() -for type in [UInt8,UInt16,UInt32,UInt64,Int8,Int16,Int32,Int64,Float16,Float32,ComplexF16,ComplexF32,Bool] +for type in [:UInt8,:UInt16,:UInt32,:UInt64,:Int8,:Int16,:Int32,:Int64,:Float16,:BFloat16,:Float32,:ComplexF16,:ComplexF32,:Bool] @eval Base.convert(::Type{MPSDataType}, ::Type{$type}) = $(Symbol(:MPSDataType, type)) @eval jl_mps_to_typ[$(Symbol(:MPSDataType, type))] = $type end From 2b1e7a27ea84a45cecc3460ed1a3ae894dc915a4 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 26 Sep 2024 12:05:38 -0300 Subject: [PATCH 2/5] Initial BFloat16 support --- lib/mps/MPS.jl | 4 ++-- src/Metal.jl | 1 + src/compiler/compilation.jl | 3 ++- src/device/intrinsics/simd.jl | 3 ++- test/device/intrinsics.jl | 16 ++++++++++------ 5 files changed, 17 insertions(+), 10 deletions(-) diff --git a/lib/mps/MPS.jl b/lib/mps/MPS.jl index 09f7754b7..b2874e26d 100644 --- a/lib/mps/MPS.jl +++ b/lib/mps/MPS.jl @@ -16,9 +16,9 @@ using ObjectiveC, .Foundation import GPUArrays -using BFloat16s +using BFloat16s: BFloat16 -const MtlFloat = Union{Float32, Float16} 
+const MtlFloat = Union{Float32, Float16, BFloat16} is_supported(dev::MTLDevice) = ccall(:MPSSupportsMTLDevice, Bool, (id{MTLDevice},), dev) diff --git a/src/Metal.jl b/src/Metal.jl index b63a69910..1a51456c2 100644 --- a/src/Metal.jl +++ b/src/Metal.jl @@ -13,6 +13,7 @@ using ExprTools: splitdef, combinedef using Artifacts using ObjectiveC, .CoreFoundation, .Foundation, .Dispatch, .OS import KernelAbstractions +using BFloat16s include("version.jl") diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl index 7c4284ca8..c8fede871 100644 --- a/src/compiler/compilation.jl +++ b/src/compiler/compilation.jl @@ -18,7 +18,8 @@ function GPUCompiler.finish_ir!(@nospecialize(job::MetalCompilerJob), # pointer type information for typed intrinsics # (this is consumed by the LLVM IR downgrader) for (jltyp, llvmtyp) in (Int32 => :i32, Int64 => :i64, - Float16 => :f16, Float32 => :f32), + Float16 => :f16, Float32 => :f32, + BFloat16 => :bf16), (as, asname) in (AS.Device => "global", AS.ThreadGroup => "local") # map of intrinsics to pointer operand indices and eltypes diff --git a/src/device/intrinsics/simd.jl b/src/device/intrinsics/simd.jl index 79250e330..e8815797d 100644 --- a/src/device/intrinsics/simd.jl +++ b/src/device/intrinsics/simd.jl @@ -7,7 +7,7 @@ function convert_origin(origin::NTuple{2, Int64}) return (VecElement{Int64}(origin[1]-1), VecElement{Int64}(origin[2]-1)) end -for (jltype, suffix) in ((:Float16, "f16"), (:Float32, "f32")) +for (jltype, suffix) in ((:Float16, "f16"), (:Float32, "f32"), (:BFloat16, "bf18")) for as in (AS.Device, AS.ThreadGroup) @eval begin @device_function simdgroup_load( @@ -88,6 +88,7 @@ Returns `a * b + c`. simd_shuffle_map = ((Float32, "f32"), (Float16, "f16"), + (BFloat16, "bf16"), (Int32, "s.i32"), (UInt32, "u.i32"), (Int16, "s.i16"), diff --git a/test/device/intrinsics.jl b/test/device/intrinsics.jl index c100b2ddd..a849bfda8 100644 --- a/test/device/intrinsics.jl +++ b/test/device/intrinsics.jl @@ -1,4 +1,5 @@ using SpecialFunctions +using BFloat16s using Metal: metal_support @testset "arguments" begin @@ -308,8 +309,9 @@ end ############################################################################################ @testset "simd intrinsics" begin - -@testset "shuffle($typ)" for typ in [Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, UInt8] +types = [Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, UInt8] +metal_support() >= v"3.1" && push!(types, BFloat16) +@testset "shuffle($typ)" for typ in types function kernel(a::MtlDeviceVector{T}, b::MtlDeviceVector{T}) where T idx = thread_position_in_grid_1d() idx_in_simd = thread_index_in_simdgroup() @@ -344,7 +346,9 @@ end end @testset "matrix functions" begin - @testset "load_store($typ)" for typ in [Float16, Float32] + simdgroup_types = [Float16, Float32] + metal_support() >= v"3.1" && push!(simdgroup_types, BFloat16) + @testset "load_store($typ)" for typ in simdgroup_types function kernel(a::MtlDeviceArray{T}, b::MtlDeviceArray{T}, origin_a=(1, 1), origin_b=(1, 1)) where {T} sg_a = simdgroup_load(a, origin_a) @@ -367,7 +371,7 @@ end end end - @testset "load_store_tg($typ)" for typ in [Float16, Float32] + @testset "load_store_tg($typ)" for typ in simdgroup_types function kernel(a::MtlDeviceArray{T}, b::MtlDeviceArray{T}) where {T} pos = thread_position_in_threadgroup_2d() @@ -391,7 +395,7 @@ end @test Array(a) == Array(b) end - @testset "mul($typ)" for typ in [Float16, Float32] + @testset "mul($typ)" for typ in simdgroup_types function kernel(a::MtlDeviceArray{T}, 
b::MtlDeviceArray{T}, c::MtlDeviceArray{T}) where {T} sg_a = simdgroup_load(a) sg_b = simdgroup_load(b) @@ -407,7 +411,7 @@ end @test Array(a) * Array(b) ≈ Array(c) end - @testset "mad($typ)" for typ in [Float16, Float32] + @testset "mad($typ)" for typ in simdgroup_types function kernel(a::MtlDeviceArray{T}, b::MtlDeviceArray{T}, c::MtlDeviceArray{T}, d::MtlDeviceArray{T}) where {T} sg_a = simdgroup_load(a) From 78d0f358e31d75457e2ecbb142182ca6706caedb Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 1 Oct 2024 11:50:58 -0300 Subject: [PATCH 3/5] Add tests --- test/array.jl | 15 ++++++++------ test/device/intrinsics.jl | 1 + test/mps/linalg.jl | 42 ++++++++++++++++++++------------------- test/runtests.jl | 15 +++++++++++++- test/setup.jl | 10 ++-------- 5 files changed, 48 insertions(+), 35 deletions(-) diff --git a/test/array.jl b/test/array.jl index e684a6d18..bf6f9f5e6 100644 --- a/test/array.jl +++ b/test/array.jl @@ -1,5 +1,9 @@ STORAGEMODES = [Metal.PrivateStorage, Metal.SharedStorage, Metal.ManagedStorage] +const FILL_TYPES = [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, + Float16, Float32] +Metal.metal_support() >= v"3.1" && push!(FILL_TYPES, BFloat16) + @testset "array" begin let arr = MtlVector{Int}(undef, 1) @@ -27,8 +31,7 @@ end @test mtl(1:3) === 1:3 - # Page 22 of https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf - # Only bfloat missing + # Section 2.1 of https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf supported_number_types = [Float16 => Float16, Float32 => Float32, Float64 => Float32, @@ -41,6 +44,8 @@ end UInt32 => UInt32, UInt64 => UInt64, UInt8 => UInt8] + Metal.metal_support() >= v"3.1" && push!(supported_number_types, BFloat16 => BFloat16) + # Test supported types and ensure only Float64 get converted to Float32 for (SrcType, TargType) in supported_number_types @test mtl(SrcType[1]) isa MtlArray{TargType} @@ -227,8 +232,7 @@ end end -@testset "fill($T)" for T in [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, - Float16, Float32] +@testset "fill($T)" for T in FILL_TYPES b = rand(T) @@ -265,8 +269,7 @@ end end end -@testset "fill!($T)" for T in [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, - Float16, Float32] +@testset "fill!($T)" for T in FILL_TYPES b = rand(T) diff --git a/test/device/intrinsics.jl b/test/device/intrinsics.jl index a849bfda8..3b5155b7b 100644 --- a/test/device/intrinsics.jl +++ b/test/device/intrinsics.jl @@ -276,6 +276,7 @@ end @testset "parametrically typed" begin typs = [Int32, Int64, Float32] + metal_support() >= v"3.1" && push!(types, BFloat16) @testset for typ in typs function kernel(d::MtlDeviceArray{T}, n) where {T} t = thread_position_in_threadgroup_1d() diff --git a/test/mps/linalg.jl b/test/mps/linalg.jl index 106d7669c..f8e9a9dc0 100644 --- a/test/mps/linalg.jl +++ b/test/mps/linalg.jl @@ -147,33 +147,36 @@ function cpu_topk(x::Matrix{T}, k; rev=true, dims=1) where {T} end @testset "topk & topk!" begin - for ftype in (Float16, Float32) + ftypes = [Float16, Float32] + + @testset "$ftype" for ftype in ftypes # Normal operation - @testset "$ftype" begin - for (shp,k) in [((3,1), 2), ((20,30), 5)] - cpu_a = rand(ftype, shp...) + @testset "$shp, k=$k" for (shp,k) in [((3,1), 2), ((20,30), 5)] + cpu_a = rand(ftype, shp...) 
- #topk - cpu_i, cpu_v = cpu_topk(cpu_a, k) + #topk + cpu_i, cpu_v = cpu_topk(cpu_a, k) - a = MtlMatrix(cpu_a) - i, v = MPS.topk(a, k) + a = MtlMatrix(cpu_a) + i, v = MPS.topk(a, k) - @test Array(i) == cpu_i - @test Array(v) == cpu_v + @test Array(i) == cpu_i + @test Array(v) == cpu_v - #topk! - i = MtlMatrix{UInt32}(undef, (k, shp[2])) - v = MtlMatrix{ftype}(undef, (k, shp[2])) + #topk! + i = MtlMatrix{UInt32}(undef, (k, shp[2])) + v = MtlMatrix{ftype}(undef, (k, shp[2])) - i, v = MPS.topk!(a, i, v, k) + i, v = MPS.topk!(a, i, v, k) - @test Array(i) == cpu_i - @test Array(v) == cpu_v - end - shp = (20,30) - k = 17 + @test Array(i) == cpu_i + @test Array(v) == cpu_v + end + # test too big `k` + shp = (20,30) + k = 17 + @testset "$shp, k=$k" begin cpu_a = rand(ftype, shp...) cpu_i, cpu_v = cpu_topk(cpu_a, k) @@ -185,7 +188,6 @@ end v = MtlMatrix{ftype}(undef, (k, shp[2])) @test_throws "MPSMatrixFindTopK does not support values of k > 16" i, v = MPS.topk!(a, i, v, k) - end end end diff --git a/test/runtests.jl b/test/runtests.jl index 187b5b9b8..0554e8689 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -73,14 +73,27 @@ for (rootpath, dirs, files) in walkdir(@__DIR__) test_runners[file] = ()->include("$(@__DIR__)/$file.jl") end end + ## GPUArrays testsuite +const gpuarr_eltypes = [Int16, Int32, Int64, + Complex{Int16}, Complex{Int32}, Complex{Int64}, + Float16, Float32, + ComplexF16, ComplexF32] +const gpuarr_eltypes_nobf16 = copy(gpuarr_eltypes) + +# Add BFloat16 for tests that use it +Metal.metal_support() >= v"3.1" && push!(gpuarr_eltypes, BFloat16) + for name in keys(TestSuite.tests) if Metal.DefaultStorageMode != Metal.PrivateStorage && name == "indexing scalar" # GPUArrays' scalar indexing tests assume that indexing is not supported continue end + + tmp_eltypes = name in ["random"] ? gpuarr_eltypes_nobf16 : gpuarr_eltypes + push!(tests, "gpuarrays$(Base.Filesystem.path_separator)$name") - test_runners["gpuarrays$(Base.Filesystem.path_separator)$name"] = ()->TestSuite.tests[name](MtlArray) + test_runners["gpuarrays$(Base.Filesystem.path_separator)$name"] = ()->TestSuite.tests[name](MtlArray;eltypes=tmp_eltypes) end unique!(tests) diff --git a/test/setup.jl b/test/setup.jl index 4694234c9..524af2e32 100644 --- a/test/setup.jl +++ b/test/setup.jl @@ -1,4 +1,4 @@ -using Distributed, Test, Metal, Adapt, ObjectiveC, ObjectiveC.Foundation +using Distributed, Test, Metal, BFloat16s, Adapt, ObjectiveC, ObjectiveC.Foundation Metal.functional() || error("Metal.jl is not functional on this system") @@ -10,12 +10,6 @@ gpuarrays_root = dirname(dirname(gpuarrays)) include(joinpath(gpuarrays_root, "test", "testsuite.jl")) testf(f, xs...; kwargs...) = TestSuite.compare(f, MtlArray, xs...; kwargs...) 
-const eltypes = [Int16, Int32, Int64, - Complex{Int16}, Complex{Int32}, Complex{Int64}, - Float16, Float32, - ComplexF16, ComplexF32] -TestSuite.supported_eltypes(::Type{<:MtlArray}) = eltypes - const runtime_validation = get(ENV, "MTL_DEBUG_LAYER", "0") != "0" const shader_validation = get(ENV, "MTL_SHADER_VALIDATION", "0") != "0" @@ -32,7 +26,7 @@ function runtests(f, name) # generate a temporary module to execute the tests in mod_name = Symbol("Test", rand(1:100), "Main_", replace(name, '/' => '_')) mod = @eval(Main, module $mod_name end) - @eval(mod, using Test, Random, Metal) + @eval(mod, using Test, Random, Metal, BFloat16s) let id = myid() wait(@spawnat 1 print_testworker_started(name, id)) From f86d5fbcada95d10be66eddcbd7ec6a27798712a Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 1 Oct 2024 17:46:53 -0300 Subject: [PATCH 4/5] More bfloat support and test fixes --- src/device/intrinsics/math.jl | 4 ++-- src/device/intrinsics/simd.jl | 2 +- test/device/intrinsics.jl | 9 +++++---- test/runtests.jl | 8 +++++++- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/device/intrinsics/math.jl b/src/device/intrinsics/math.jl index e7544c1f0..125f41bf8 100644 --- a/src/device/intrinsics/math.jl +++ b/src/device/intrinsics/math.jl @@ -418,7 +418,7 @@ end j = fma(1.442695f0, a, 12582912.0f0) j = j - 12582912.0f0 i = unsafe_trunc(Int32, j) - f = fma(j, -6.93145752f-1, a) # log_2_hi + f = fma(j, -6.93145752f-1, a) # log_2_hi f = fma(j, -1.42860677f-6, f) # log_2_lo # approximate r = exp(f)-1 on interval [-log(2)/2, +log(2)/2] @@ -460,4 +460,4 @@ end end return r -end \ No newline at end of file +end diff --git a/src/device/intrinsics/simd.jl b/src/device/intrinsics/simd.jl index e8815797d..8d83e92bd 100644 --- a/src/device/intrinsics/simd.jl +++ b/src/device/intrinsics/simd.jl @@ -7,7 +7,7 @@ function convert_origin(origin::NTuple{2, Int64}) return (VecElement{Int64}(origin[1]-1), VecElement{Int64}(origin[2]-1)) end -for (jltype, suffix) in ((:Float16, "f16"), (:Float32, "f32"), (:BFloat16, "bf18")) +for (jltype, suffix) in ((:Float16, "f16"), (:Float32, "f32"), (:BFloat16, "bf16")) for as in (AS.Device, AS.ThreadGroup) @eval begin @device_function simdgroup_load( diff --git a/test/device/intrinsics.jl b/test/device/intrinsics.jl index 3b5155b7b..a90812242 100644 --- a/test/device/intrinsics.jl +++ b/test/device/intrinsics.jl @@ -275,9 +275,9 @@ end end @testset "parametrically typed" begin - typs = [Int32, Int64, Float32] + types = [Int32, Int64, Float32] metal_support() >= v"3.1" && push!(types, BFloat16) - @testset for typ in typs + @testset for typ in types function kernel(d::MtlDeviceArray{T}, n) where {T} t = thread_position_in_threadgroup_1d() tr = n-t+1 @@ -405,8 +405,9 @@ end return end - a = MtlArray(rand(typ, 8, 8)) - b = MtlArray(rand(typ, 8, 8)) + #Use `ones` for figuring out issues + a = MtlArray(ones(typ, 8, 8)) + b = MtlArray(ones(typ, 8, 8)) c = MtlArray(zeros(typ, 8, 8)) @metal threads=(8, 8) kernel(a, b, c) @test Array(a) * Array(b) ≈ Array(c) diff --git a/test/runtests.jl b/test/runtests.jl index 0554e8689..584dc3183 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -81,6 +81,12 @@ const gpuarr_eltypes = [Int16, Int32, Int64, ComplexF16, ComplexF32] const gpuarr_eltypes_nobf16 = copy(gpuarr_eltypes) +# don't test BFloat16 for unsupported operations +nobf16_tests = ["random", "reductions/reducedim!", + "reductions/mapreducedim!_large", "reductions/mapreduce", + "reductions/== isequal", 
"reductions/minimum maximum extrema", + "reductions/sum prod", "reductions/mapreducedim!", "reductions/reduce"] + # Add BFloat16 for tests that use it Metal.metal_support() >= v"3.1" && push!(gpuarr_eltypes, BFloat16) @@ -90,7 +96,7 @@ for name in keys(TestSuite.tests) continue end - tmp_eltypes = name in ["random"] ? gpuarr_eltypes_nobf16 : gpuarr_eltypes + tmp_eltypes = name in nobf16_tests ? gpuarr_eltypes_nobf16 : gpuarr_eltypes push!(tests, "gpuarrays$(Base.Filesystem.path_separator)$name") test_runners["gpuarrays$(Base.Filesystem.path_separator)$name"] = ()->TestSuite.tests[name](MtlArray;eltypes=tmp_eltypes) From caf8f9de7849683c097310787eb379474cec5452 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Sat, 12 Oct 2024 14:40:18 -0300 Subject: [PATCH 5/5] Tweak docstrings [skip tests] --- src/device/intrinsics/simd.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/device/intrinsics/simd.jl b/src/device/intrinsics/simd.jl index 8d83e92bd..45e182508 100644 --- a/src/device/intrinsics/simd.jl +++ b/src/device/intrinsics/simd.jl @@ -55,7 +55,7 @@ end simdgroup_load(data::MtlDeviceArray{T}, matrix_origin=(1, 1)) Loads data from device or threadgroup memory into an 8x8 SIMD-group matrix -and returns it. `T` must be either `Float16` or `Float32`. +and returns it. `T` must be either `Float16`, `Float32`, or `BFloat16`. # Arguments - `matrix_origin::NTuple{2, Int64}=(1, 1)`: origin in the source memory to load from. @@ -65,7 +65,7 @@ and returns it. `T` must be either `Float16` or `Float32`. simdgroup_store(src, dest::MtlDeviceArray{T}, matrix_origin=(1, 1)) Stores data from an 8x8 SIMD-group matrix into device or threadgroup memory. -`T` must be either `Float16` or `Float32`. +`T` must be either `Float16`, `Float32`, `BFloat16`. # Arguments - `matrix_origin::NTuple{2, Int64}=(1, 1)`: origin in the destination memory to store to. @@ -119,7 +119,7 @@ The value for delta must be the same for all threads in the SIMD-group. This fun doesn’t modify the upper delta lanes of data because it doesn’t wrap values around the SIMD-group. -T must be one of the following: Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, or UInt8 +T must be one of the following: Float32, Float16, BFloat16, Int32, UInt32, Int16, UInt16, Int8, or UInt8 """ simd_shuffle_down @@ -132,6 +132,6 @@ lane ID minus delta. The value of delta must be the same for all threads in a SIMD-group. This function doesn’t modify the lower delta lanes of data because it doesn’t wrap values around the SIMD-group. -T must be one of the following: Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, or UInt8 +T must be one of the following: Float32, Float16, BFloat16, Int32, UInt32, Int16, UInt16, Int8, or UInt8 """ simd_shuffle_up