From 63379de799960ec2a0af62771c0d82adadda6ae8 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 26 Sep 2024 10:38:08 -0300 Subject: [PATCH 1/5] Move BFloat16 code out of extension --- Project.toml | 3 +-- ext/BFloat16sExt.jl | 14 -------------- lib/mps/MPS.jl | 2 ++ lib/mps/matrix.jl | 2 +- 4 files changed, 4 insertions(+), 17 deletions(-) delete mode 100644 ext/BFloat16sExt.jl diff --git a/Project.toml b/Project.toml index 4e7f1bd88..90cb06fb4 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "1.4.0" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" +BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82" CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd" ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" @@ -26,11 +27,9 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [weakdeps] -BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" [extensions] -BFloat16sExt = "BFloat16s" SpecialFunctionsExt = "SpecialFunctions" [compat] diff --git a/ext/BFloat16sExt.jl b/ext/BFloat16sExt.jl deleted file mode 100644 index 6f11b18fc..000000000 --- a/ext/BFloat16sExt.jl +++ /dev/null @@ -1,14 +0,0 @@ -module BFloat16sExt - -using Metal: MPS.MPSDataType, MPS.MPSDataTypeBFloat16, MPS.jl_mps_to_typ, macos_version -using BFloat16s - -# BFloat is only supported in MPS starting in MacOS 14 -@static if Sys.isapple() - if macos_version() >= v"14" - Base.convert(::Type{MPSDataType}, ::Type{BFloat16}) = MPSDataTypeBFloat16 - jl_mps_to_typ[MPSDataTypeBFloat16] = BFloat16 - end -end - -end # module diff --git a/lib/mps/MPS.jl b/lib/mps/MPS.jl index 7266eae9a..09f7754b7 100644 --- a/lib/mps/MPS.jl +++ b/lib/mps/MPS.jl @@ -16,6 +16,8 @@ using ObjectiveC, .Foundation import GPUArrays +using BFloat16s + const MtlFloat = Union{Float32, Float16} is_supported(dev::MTLDevice) = ccall(:MPSSupportsMTLDevice, Bool, (id{MTLDevice},), dev) diff --git a/lib/mps/matrix.jl b/lib/mps/matrix.jl index 9eb86d75b..295d63c53 100644 --- a/lib/mps/matrix.jl +++ b/lib/mps/matrix.jl @@ -38,7 +38,7 @@ Base.convert(::Type{MPSDataType}, x::Integer) = MPSDataType(x) # Conversions for MPSDataTypes with Julia equivalents const jl_mps_to_typ = Dict{MPSDataType, DataType}() -for type in [UInt8,UInt16,UInt32,UInt64,Int8,Int16,Int32,Int64,Float16,Float32,ComplexF16,ComplexF32,Bool] +for type in [:UInt8,:UInt16,:UInt32,:UInt64,:Int8,:Int16,:Int32,:Int64,:Float16,:BFloat16,:Float32,:ComplexF16,:ComplexF32,:Bool] @eval Base.convert(::Type{MPSDataType}, ::Type{$type}) = $(Symbol(:MPSDataType, type)) @eval jl_mps_to_typ[$(Symbol(:MPSDataType, type))] = $type end From 2b1e7a27ea84a45cecc3460ed1a3ae894dc915a4 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 26 Sep 2024 12:05:38 -0300 Subject: [PATCH 2/5] Initial BFloat16 support --- lib/mps/MPS.jl | 4 ++-- src/Metal.jl | 1 + src/compiler/compilation.jl | 3 ++- src/device/intrinsics/simd.jl | 3 ++- test/device/intrinsics.jl | 16 ++++++++++------ 5 files changed, 17 insertions(+), 10 deletions(-) diff --git a/lib/mps/MPS.jl b/lib/mps/MPS.jl index 09f7754b7..b2874e26d 100644 --- a/lib/mps/MPS.jl +++ b/lib/mps/MPS.jl @@ -16,9 +16,9 @@ using ObjectiveC, .Foundation import GPUArrays -using BFloat16s +using BFloat16s: BFloat16 -const MtlFloat = Union{Float32, Float16} 
+const MtlFloat = Union{Float32, Float16, BFloat16} is_supported(dev::MTLDevice) = ccall(:MPSSupportsMTLDevice, Bool, (id{MTLDevice},), dev) diff --git a/src/Metal.jl b/src/Metal.jl index b63a69910..1a51456c2 100644 --- a/src/Metal.jl +++ b/src/Metal.jl @@ -13,6 +13,7 @@ using ExprTools: splitdef, combinedef using Artifacts using ObjectiveC, .CoreFoundation, .Foundation, .Dispatch, .OS import KernelAbstractions +using BFloat16s include("version.jl") diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl index 7c4284ca8..c8fede871 100644 --- a/src/compiler/compilation.jl +++ b/src/compiler/compilation.jl @@ -18,7 +18,8 @@ function GPUCompiler.finish_ir!(@nospecialize(job::MetalCompilerJob), # pointer type information for typed intrinsics # (this is consumed by the LLVM IR downgrader) for (jltyp, llvmtyp) in (Int32 => :i32, Int64 => :i64, - Float16 => :f16, Float32 => :f32), + Float16 => :f16, Float32 => :f32, + BFloat16 => :bf16), (as, asname) in (AS.Device => "global", AS.ThreadGroup => "local") # map of intrinsics to pointer operand indices and eltypes diff --git a/src/device/intrinsics/simd.jl b/src/device/intrinsics/simd.jl index 79250e330..e8815797d 100644 --- a/src/device/intrinsics/simd.jl +++ b/src/device/intrinsics/simd.jl @@ -7,7 +7,7 @@ function convert_origin(origin::NTuple{2, Int64}) return (VecElement{Int64}(origin[1]-1), VecElement{Int64}(origin[2]-1)) end -for (jltype, suffix) in ((:Float16, "f16"), (:Float32, "f32")) +for (jltype, suffix) in ((:Float16, "f16"), (:Float32, "f32"), (:BFloat16, "bf18")) for as in (AS.Device, AS.ThreadGroup) @eval begin @device_function simdgroup_load( @@ -88,6 +88,7 @@ Returns `a * b + c`. simd_shuffle_map = ((Float32, "f32"), (Float16, "f16"), + (BFloat16, "bf16"), (Int32, "s.i32"), (UInt32, "u.i32"), (Int16, "s.i16"), diff --git a/test/device/intrinsics.jl b/test/device/intrinsics.jl index c100b2ddd..a849bfda8 100644 --- a/test/device/intrinsics.jl +++ b/test/device/intrinsics.jl @@ -1,4 +1,5 @@ using SpecialFunctions +using BFloat16s using Metal: metal_support @testset "arguments" begin @@ -308,8 +309,9 @@ end ############################################################################################ @testset "simd intrinsics" begin - -@testset "shuffle($typ)" for typ in [Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, UInt8] +types = [Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, UInt8] +metal_support() >= v"3.1" && push!(types, BFloat16) +@testset "shuffle($typ)" for typ in types function kernel(a::MtlDeviceVector{T}, b::MtlDeviceVector{T}) where T idx = thread_position_in_grid_1d() idx_in_simd = thread_index_in_simdgroup() @@ -344,7 +346,9 @@ end end @testset "matrix functions" begin - @testset "load_store($typ)" for typ in [Float16, Float32] + simdgroup_types = [Float16, Float32] + metal_support() >= v"3.1" && push!(simdgroup_types, BFloat16) + @testset "load_store($typ)" for typ in simdgroup_types function kernel(a::MtlDeviceArray{T}, b::MtlDeviceArray{T}, origin_a=(1, 1), origin_b=(1, 1)) where {T} sg_a = simdgroup_load(a, origin_a) @@ -367,7 +371,7 @@ end end end - @testset "load_store_tg($typ)" for typ in [Float16, Float32] + @testset "load_store_tg($typ)" for typ in simdgroup_types function kernel(a::MtlDeviceArray{T}, b::MtlDeviceArray{T}) where {T} pos = thread_position_in_threadgroup_2d() @@ -391,7 +395,7 @@ end @test Array(a) == Array(b) end - @testset "mul($typ)" for typ in [Float16, Float32] + @testset "mul($typ)" for typ in simdgroup_types function kernel(a::MtlDeviceArray{T}, 
b::MtlDeviceArray{T}, c::MtlDeviceArray{T}) where {T} sg_a = simdgroup_load(a) sg_b = simdgroup_load(b) @@ -407,7 +411,7 @@ end @test Array(a) * Array(b) ≈ Array(c) end - @testset "mad($typ)" for typ in [Float16, Float32] + @testset "mad($typ)" for typ in simdgroup_types function kernel(a::MtlDeviceArray{T}, b::MtlDeviceArray{T}, c::MtlDeviceArray{T}, d::MtlDeviceArray{T}) where {T} sg_a = simdgroup_load(a) From 78d0f358e31d75457e2ecbb142182ca6706caedb Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 1 Oct 2024 11:50:58 -0300 Subject: [PATCH 3/5] Add tests --- test/array.jl | 15 ++++++++------ test/device/intrinsics.jl | 1 + test/mps/linalg.jl | 42 ++++++++++++++++++++------------------- test/runtests.jl | 15 +++++++++++++- test/setup.jl | 10 ++-------- 5 files changed, 48 insertions(+), 35 deletions(-) diff --git a/test/array.jl b/test/array.jl index e684a6d18..bf6f9f5e6 100644 --- a/test/array.jl +++ b/test/array.jl @@ -1,5 +1,9 @@ STORAGEMODES = [Metal.PrivateStorage, Metal.SharedStorage, Metal.ManagedStorage] +const FILL_TYPES = [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, + Float16, Float32] +Metal.metal_support() >= v"3.1" && push!(FILL_TYPES, BFloat16) + @testset "array" begin let arr = MtlVector{Int}(undef, 1) @@ -27,8 +31,7 @@ end @test mtl(1:3) === 1:3 - # Page 22 of https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf - # Only bfloat missing + # Section 2.1 of https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf supported_number_types = [Float16 => Float16, Float32 => Float32, Float64 => Float32, @@ -41,6 +44,8 @@ end UInt32 => UInt32, UInt64 => UInt64, UInt8 => UInt8] + Metal.metal_support() >= v"3.1" && push!(supported_number_types, BFloat16 => BFloat16) + # Test supported types and ensure only Float64 get converted to Float32 for (SrcType, TargType) in supported_number_types @test mtl(SrcType[1]) isa MtlArray{TargType} @@ -227,8 +232,7 @@ end end -@testset "fill($T)" for T in [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, - Float16, Float32] +@testset "fill($T)" for T in FILL_TYPES b = rand(T) @@ -265,8 +269,7 @@ end end end -@testset "fill!($T)" for T in [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, - Float16, Float32] +@testset "fill!($T)" for T in FILL_TYPES b = rand(T) diff --git a/test/device/intrinsics.jl b/test/device/intrinsics.jl index a849bfda8..3b5155b7b 100644 --- a/test/device/intrinsics.jl +++ b/test/device/intrinsics.jl @@ -276,6 +276,7 @@ end @testset "parametrically typed" begin typs = [Int32, Int64, Float32] + metal_support() >= v"3.1" && push!(types, BFloat16) @testset for typ in typs function kernel(d::MtlDeviceArray{T}, n) where {T} t = thread_position_in_threadgroup_1d() diff --git a/test/mps/linalg.jl b/test/mps/linalg.jl index 106d7669c..f8e9a9dc0 100644 --- a/test/mps/linalg.jl +++ b/test/mps/linalg.jl @@ -147,33 +147,36 @@ function cpu_topk(x::Matrix{T}, k; rev=true, dims=1) where {T} end @testset "topk & topk!" begin - for ftype in (Float16, Float32) + ftypes = [Float16, Float32] + + @testset "$ftype" for ftype in ftypes # Normal operation - @testset "$ftype" begin - for (shp,k) in [((3,1), 2), ((20,30), 5)] - cpu_a = rand(ftype, shp...) + @testset "$shp, k=$k" for (shp,k) in [((3,1), 2), ((20,30), 5)] + cpu_a = rand(ftype, shp...) 
- #topk - cpu_i, cpu_v = cpu_topk(cpu_a, k) + #topk + cpu_i, cpu_v = cpu_topk(cpu_a, k) - a = MtlMatrix(cpu_a) - i, v = MPS.topk(a, k) + a = MtlMatrix(cpu_a) + i, v = MPS.topk(a, k) - @test Array(i) == cpu_i - @test Array(v) == cpu_v + @test Array(i) == cpu_i + @test Array(v) == cpu_v - #topk! - i = MtlMatrix{UInt32}(undef, (k, shp[2])) - v = MtlMatrix{ftype}(undef, (k, shp[2])) + #topk! + i = MtlMatrix{UInt32}(undef, (k, shp[2])) + v = MtlMatrix{ftype}(undef, (k, shp[2])) - i, v = MPS.topk!(a, i, v, k) + i, v = MPS.topk!(a, i, v, k) - @test Array(i) == cpu_i - @test Array(v) == cpu_v - end - shp = (20,30) - k = 17 + @test Array(i) == cpu_i + @test Array(v) == cpu_v + end + # test too big `k` + shp = (20,30) + k = 17 + @testset "$shp, k=$k" begin cpu_a = rand(ftype, shp...) cpu_i, cpu_v = cpu_topk(cpu_a, k) @@ -185,7 +188,6 @@ end v = MtlMatrix{ftype}(undef, (k, shp[2])) @test_throws "MPSMatrixFindTopK does not support values of k > 16" i, v = MPS.topk!(a, i, v, k) - end end end diff --git a/test/runtests.jl b/test/runtests.jl index 187b5b9b8..0554e8689 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -73,14 +73,27 @@ for (rootpath, dirs, files) in walkdir(@__DIR__) test_runners[file] = ()->include("$(@__DIR__)/$file.jl") end end + ## GPUArrays testsuite +const gpuarr_eltypes = [Int16, Int32, Int64, + Complex{Int16}, Complex{Int32}, Complex{Int64}, + Float16, Float32, + ComplexF16, ComplexF32] +const gpuarr_eltypes_nobf16 = copy(gpuarr_eltypes) + +# Add BFloat16 for tests that use it +Metal.metal_support() >= v"3.1" && push!(gpuarr_eltypes, BFloat16) + for name in keys(TestSuite.tests) if Metal.DefaultStorageMode != Metal.PrivateStorage && name == "indexing scalar" # GPUArrays' scalar indexing tests assume that indexing is not supported continue end + + tmp_eltypes = name in ["random"] ? gpuarr_eltypes_nobf16 : gpuarr_eltypes + push!(tests, "gpuarrays$(Base.Filesystem.path_separator)$name") - test_runners["gpuarrays$(Base.Filesystem.path_separator)$name"] = ()->TestSuite.tests[name](MtlArray) + test_runners["gpuarrays$(Base.Filesystem.path_separator)$name"] = ()->TestSuite.tests[name](MtlArray;eltypes=tmp_eltypes) end unique!(tests) diff --git a/test/setup.jl b/test/setup.jl index 4694234c9..524af2e32 100644 --- a/test/setup.jl +++ b/test/setup.jl @@ -1,4 +1,4 @@ -using Distributed, Test, Metal, Adapt, ObjectiveC, ObjectiveC.Foundation +using Distributed, Test, Metal, BFloat16s, Adapt, ObjectiveC, ObjectiveC.Foundation Metal.functional() || error("Metal.jl is not functional on this system") @@ -10,12 +10,6 @@ gpuarrays_root = dirname(dirname(gpuarrays)) include(joinpath(gpuarrays_root, "test", "testsuite.jl")) testf(f, xs...; kwargs...) = TestSuite.compare(f, MtlArray, xs...; kwargs...) 
-const eltypes = [Int16, Int32, Int64, - Complex{Int16}, Complex{Int32}, Complex{Int64}, - Float16, Float32, - ComplexF16, ComplexF32] -TestSuite.supported_eltypes(::Type{<:MtlArray}) = eltypes - const runtime_validation = get(ENV, "MTL_DEBUG_LAYER", "0") != "0" const shader_validation = get(ENV, "MTL_SHADER_VALIDATION", "0") != "0" @@ -32,7 +26,7 @@ function runtests(f, name) # generate a temporary module to execute the tests in mod_name = Symbol("Test", rand(1:100), "Main_", replace(name, '/' => '_')) mod = @eval(Main, module $mod_name end) - @eval(mod, using Test, Random, Metal) + @eval(mod, using Test, Random, Metal, BFloat16s) let id = myid() wait(@spawnat 1 print_testworker_started(name, id)) From f86d5fbcada95d10be66eddcbd7ec6a27798712a Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 1 Oct 2024 17:46:53 -0300 Subject: [PATCH 4/5] More bfloat support and test fixes --- src/device/intrinsics/math.jl | 4 ++-- src/device/intrinsics/simd.jl | 2 +- test/device/intrinsics.jl | 9 +++++---- test/runtests.jl | 8 +++++++- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/device/intrinsics/math.jl b/src/device/intrinsics/math.jl index e7544c1f0..125f41bf8 100644 --- a/src/device/intrinsics/math.jl +++ b/src/device/intrinsics/math.jl @@ -418,7 +418,7 @@ end j = fma(1.442695f0, a, 12582912.0f0) j = j - 12582912.0f0 i = unsafe_trunc(Int32, j) - f = fma(j, -6.93145752f-1, a) # log_2_hi + f = fma(j, -6.93145752f-1, a) # log_2_hi f = fma(j, -1.42860677f-6, f) # log_2_lo # approximate r = exp(f)-1 on interval [-log(2)/2, +log(2)/2] @@ -460,4 +460,4 @@ end end return r -end \ No newline at end of file +end diff --git a/src/device/intrinsics/simd.jl b/src/device/intrinsics/simd.jl index e8815797d..8d83e92bd 100644 --- a/src/device/intrinsics/simd.jl +++ b/src/device/intrinsics/simd.jl @@ -7,7 +7,7 @@ function convert_origin(origin::NTuple{2, Int64}) return (VecElement{Int64}(origin[1]-1), VecElement{Int64}(origin[2]-1)) end -for (jltype, suffix) in ((:Float16, "f16"), (:Float32, "f32"), (:BFloat16, "bf18")) +for (jltype, suffix) in ((:Float16, "f16"), (:Float32, "f32"), (:BFloat16, "bf16")) for as in (AS.Device, AS.ThreadGroup) @eval begin @device_function simdgroup_load( diff --git a/test/device/intrinsics.jl b/test/device/intrinsics.jl index 3b5155b7b..a90812242 100644 --- a/test/device/intrinsics.jl +++ b/test/device/intrinsics.jl @@ -275,9 +275,9 @@ end end @testset "parametrically typed" begin - typs = [Int32, Int64, Float32] + types = [Int32, Int64, Float32] metal_support() >= v"3.1" && push!(types, BFloat16) - @testset for typ in typs + @testset for typ in types function kernel(d::MtlDeviceArray{T}, n) where {T} t = thread_position_in_threadgroup_1d() tr = n-t+1 @@ -405,8 +405,9 @@ end return end - a = MtlArray(rand(typ, 8, 8)) - b = MtlArray(rand(typ, 8, 8)) + #Use `ones` for figuring out issues + a = MtlArray(ones(typ, 8, 8)) + b = MtlArray(ones(typ, 8, 8)) c = MtlArray(zeros(typ, 8, 8)) @metal threads=(8, 8) kernel(a, b, c) @test Array(a) * Array(b) ≈ Array(c) diff --git a/test/runtests.jl b/test/runtests.jl index 0554e8689..584dc3183 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -81,6 +81,12 @@ const gpuarr_eltypes = [Int16, Int32, Int64, ComplexF16, ComplexF32] const gpuarr_eltypes_nobf16 = copy(gpuarr_eltypes) +# don't test BFloat16 for unsupported operations +nobf16_tests = ["random", "reductions/reducedim!", + "reductions/mapreducedim!_large", "reductions/mapreduce", + "reductions/== isequal", 
"reductions/minimum maximum extrema", + "reductions/sum prod", "reductions/mapreducedim!", "reductions/reduce"] + # Add BFloat16 for tests that use it Metal.metal_support() >= v"3.1" && push!(gpuarr_eltypes, BFloat16) @@ -90,7 +96,7 @@ for name in keys(TestSuite.tests) continue end - tmp_eltypes = name in ["random"] ? gpuarr_eltypes_nobf16 : gpuarr_eltypes + tmp_eltypes = name in nobf16_tests ? gpuarr_eltypes_nobf16 : gpuarr_eltypes push!(tests, "gpuarrays$(Base.Filesystem.path_separator)$name") test_runners["gpuarrays$(Base.Filesystem.path_separator)$name"] = ()->TestSuite.tests[name](MtlArray;eltypes=tmp_eltypes) From caf8f9de7849683c097310787eb379474cec5452 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Sat, 12 Oct 2024 14:40:18 -0300 Subject: [PATCH 5/5] Tweak docstrings [skip tests] --- src/device/intrinsics/simd.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/device/intrinsics/simd.jl b/src/device/intrinsics/simd.jl index 8d83e92bd..45e182508 100644 --- a/src/device/intrinsics/simd.jl +++ b/src/device/intrinsics/simd.jl @@ -55,7 +55,7 @@ end simdgroup_load(data::MtlDeviceArray{T}, matrix_origin=(1, 1)) Loads data from device or threadgroup memory into an 8x8 SIMD-group matrix -and returns it. `T` must be either `Float16` or `Float32`. +and returns it. `T` must be either `Float16`, `Float32`, or `BFloat16`. # Arguments - `matrix_origin::NTuple{2, Int64}=(1, 1)`: origin in the source memory to load from. @@ -65,7 +65,7 @@ and returns it. `T` must be either `Float16` or `Float32`. simdgroup_store(src, dest::MtlDeviceArray{T}, matrix_origin=(1, 1)) Stores data from an 8x8 SIMD-group matrix into device or threadgroup memory. -`T` must be either `Float16` or `Float32`. +`T` must be either `Float16`, `Float32`, `BFloat16`. # Arguments - `matrix_origin::NTuple{2, Int64}=(1, 1)`: origin in the destination memory to store to. @@ -119,7 +119,7 @@ The value for delta must be the same for all threads in the SIMD-group. This fun doesn’t modify the upper delta lanes of data because it doesn’t wrap values around the SIMD-group. -T must be one of the following: Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, or UInt8 +T must be one of the following: Float32, Float16, BFloat16, Int32, UInt32, Int16, UInt16, Int8, or UInt8 """ simd_shuffle_down @@ -132,6 +132,6 @@ lane ID minus delta. The value of delta must be the same for all threads in a SIMD-group. This function doesn’t modify the lower delta lanes of data because it doesn’t wrap values around the SIMD-group. -T must be one of the following: Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, or UInt8 +T must be one of the following: Float32, Float16, BFloat16, Int32, UInt32, Int16, UInt16, Int8, or UInt8 """ simd_shuffle_up