-
Notifications
You must be signed in to change notification settings - Fork 38
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Int128 does not compile #287
Comments
I suspect the compiler does not know how to legalize `i128`:
; ModuleID = 'shader.air'
source_filename = "start"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-n8:16:32"
target triple = "air64-apple-macosx14.3.1"
define void @_Z16broadcast_kernel16mtlKernelContext14MtlDeviceArrayI6Int128Li1ELi1EE11BroadcastedI13MtlArrayStyleILi1E39Metal_MTL_MTLResourceStorageModePrivateE5TupleI5OneToI5Int64EE1_S4_I8ExtrudedIS0_IS1_Li1ELi1EES4_I4BoolES4_IS6_EES8_IS0_IS1_Li1ELi1EES4_IS9_ES4_IS6_EEEES6_({ i8 addrspace(1)*, [1 x i64] } addrspace(1)* %0, { [2 x { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] }], [1 x [1 x i64]] } addrspace(1)* %1, i64 addrspace(1)* %2, i32 %threads_per_grid, i32 %thread_position_in_grid) local_unnamed_addr {
conversion:
%3 = getelementptr inbounds { i8 addrspace(1)*, [1 x i64] }, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %0, i64 0, i32 1, i64 0
%.unpack4.unpack = load i64, i64 addrspace(1)* %3, align 8
%.unpack.elt = getelementptr inbounds { [2 x { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] }], [1 x [1 x i64]] }, { [2 x { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] }], [1 x [1 x i64]] } addrspace(1)* %1, i64 0, i32 0, i64 0
%.unpack.unpack = load { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] }, { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] } addrspace(1)* %.unpack.elt, align 8
%.unpack.elt9 = getelementptr inbounds { [2 x { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] }], [1 x [1 x i64]] }, { [2 x { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] }], [1 x [1 x i64]] } addrspace(1)* %1, i64 0, i32 0, i64 1
%.unpack.unpack10 = load { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] }, { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] } addrspace(1)* %.unpack.elt9, align 8
%.fca.0.0.2.0.extract = extractvalue { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] } %.unpack.unpack, 2, 0
%.fca.0.1.2.0.extract = extractvalue { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] } %.unpack.unpack10, 2, 0
%4 = load i64, i64 addrspace(1)* %2, align 8
%5 = zext i32 %threads_per_grid to i64
%.not7 = icmp sgt i64 %4, 0
br i1 %.not7, label %L5.lr.ph, label %common.ret
L5.lr.ph: ; preds = %conversion
%.fca.0.1.1.0.extract = extractvalue { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] } %.unpack.unpack10, 1, 0
%.fca.0.1.0.0.extract = extractvalue { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] } %.unpack.unpack10, 0, 0
%.fca.0.0.1.0.extract = extractvalue { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] } %.unpack.unpack, 1, 0
%.fca.0.0.0.0.extract = extractvalue { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] } %.unpack.unpack, 0, 0
%6 = bitcast { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %0 to i128 addrspace(1)* addrspace(1)*
%.unpack6 = load i128 addrspace(1)*, i128 addrspace(1)* addrspace(1)* %6, align 8
%7 = add i32 %thread_position_in_grid, 1
%8 = zext i32 %7 to i64
%.not2 = icmp eq i8 %.fca.0.0.1.0.extract, 0
%9 = bitcast i8 addrspace(1)* %.fca.0.0.0.0.extract to i128 addrspace(1)*
%.not4 = icmp eq i8 %.fca.0.1.1.0.extract, 0
%10 = bitcast i8 addrspace(1)* %.fca.0.1.0.0.extract to i128 addrspace(1)*
br label %L5
L5: ; preds = %L20, %L5.lr.ph
%value_phi8 = phi i64 [ 0, %L5.lr.ph ], [ %13, %L20 ]
%11 = mul i64 %value_phi8, %5
%12 = add i64 %11, %8
%.not1 = icmp slt i64 %.unpack4.unpack, %12
br i1 %.not1, label %common.ret, label %L20
common.ret: ; preds = %L20, %L5, %conversion
ret void
L20: ; preds = %L5
%13 = add nuw nsw i64 %value_phi8, 1
%14 = select i1 %.not2, i64 %.fca.0.0.2.0.extract, i64 %12
%15 = add i64 %14, -1
%16 = getelementptr inbounds i128, i128 addrspace(1)* %9, i64 %15
%17 = load i128, i128 addrspace(1)* %16, align 16, !tbaa !21
%18 = select i1 %.not4, i64 %.fca.0.1.2.0.extract, i64 %12
%19 = add i64 %18, -1
%20 = getelementptr inbounds i128, i128 addrspace(1)* %10, i64 %19
%21 = load i128, i128 addrspace(1)* %20, align 16, !tbaa !21
%22 = mul i128 %21, %17
%23 = add i64 %12, -1
%24 = getelementptr inbounds i128, i128 addrspace(1)* %.unpack6, i64 %23
store i128 %22, i128 addrspace(1)* %24, align 16, !tbaa !21
%.not = icmp slt i64 %13, %4
br i1 %.not, label %L5, label %common.ret
}
!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6, !7, !8}
!julia.kernel = !{!9}
!air.kernel = !{!10}
!llvm.ident = !{!18}
!air.version = !{!19}
!air.language_version = !{!20}
!0 = !{i32 2, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = !{i32 7, !"air.max_device_buffers", i32 31}
!3 = !{i32 7, !"air.max_constant_buffers", i32 31}
!4 = !{i32 7, !"air.max_threadgroup_buffers", i32 31}
!5 = !{i32 7, !"air.max_textures", i32 128}
!6 = !{i32 7, !"air.max_read_write_textures", i32 8}
!7 = !{i32 7, !"air.max_samplers", i32 16}
!8 = !{i32 2, !"SDK Version", [3 x i32] [i32 14, i32 3, i32 1]}
!9 = !{void ({ i8 addrspace(1)*, [1 x i64] } addrspace(1)*, { [2 x { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] }], [1 x [1 x i64]] } addrspace(1)*, i64 addrspace(1)*, i32, i32)* @_Z16broadcast_kernel16mtlKernelContext14MtlDeviceArrayI6Int128Li1ELi1EE11BroadcastedI13MtlArrayStyleILi1E39Metal_MTL_MTLResourceStorageModePrivateE5TupleI5OneToI5Int64EE1_S4_I8ExtrudedIS0_IS1_Li1ELi1EES4_I4BoolES4_IS6_EES8_IS0_IS1_Li1ELi1EES4_IS9_ES4_IS6_EEEES6_}
!10 = !{void ({ i8 addrspace(1)*, [1 x i64] } addrspace(1)*, { [2 x { { i8 addrspace(1)*, [1 x i64] }, [1 x i8], [1 x i64] }], [1 x [1 x i64]] } addrspace(1)*, i64 addrspace(1)*, i32, i32)* @_Z16broadcast_kernel16mtlKernelContext14MtlDeviceArrayI6Int128Li1ELi1EE11BroadcastedI13MtlArrayStyleILi1E39Metal_MTL_MTLResourceStorageModePrivateE5TupleI5OneToI5Int64EE1_S4_I8ExtrudedIS0_IS1_Li1ELi1EES4_I4BoolES4_IS6_EES8_IS0_IS1_Li1ELi1EES4_IS9_ES4_IS6_EEEES6_, !11, !12}
!11 = !{}
!12 = !{!13, !14, !15, !16, !17}
!13 = !{i32 0, !"air.buffer", !"air.location_index", i32 0, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 8, !"air.arg_type_name", !"MtlDeviceVector{Int128, 1}", !"air.arg_name", !"dest"}
!14 = !{i32 1, !"air.buffer", !"air.location_index", i32 1, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 72, !"air.arg_type_align_size", i32 8, !"air.arg_type_name", !"Base.Broadcast.Broadcasted{Metal.MtlArrayStyle{1, Metal.MTL.MTLResourceStorageModePrivate}, Tuple{Base.OneTo{Int64}}, typeof(*), Tuple{Base.Broadcast.Extruded{MtlDeviceVector{Int128, 1}, Tuple{Bool}, Tuple{Int64}}, Base.Broadcast.Extruded{MtlDeviceVector{Int128, 1}, Tuple{Bool}, Tuple{Int64}}}}", !"air.arg_name", !"bc\E2"}
!15 = !{i32 2, !"air.buffer", !"air.location_index", i32 2, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 8, !"air.arg_type_align_size", i32 8, !"air.arg_type_name", !"Int64", !"air.arg_name", !"nelem"}
!16 = !{i32 3, !"air.threads_per_grid", !"air.arg_type_name", !"uint"}
!17 = !{i32 4, !"air.thread_position_in_grid", !"air.arg_type_name", !"uint"}
!18 = !{!"Julia 1.10.1 with Metal.jl"}
!19 = !{i32 3, i32 0, i32 0}
!20 = !{!"Metal", i32 3, i32 0, i32 0}
!21 = !{!22, !22, i64 0, i64 0}
!22 = !{!"custom_tbaa_addrspace(1)", !23, i64 0}
!23 = !{!"custom_tbaa"}
@maleadt Metal doesn't support Int128.
Can we have an implementation where it accepts Int128 but uses two Int64 under the hood? I assume that's how the actual Julia Int128 works too, since ARM uses a 64-bit architecture. This would be very useful for cryptographic computations (my current use case).
We generally don't do these kind of automatic transformations with Julia's GPU back-ends. The user is still responsible for using the package in a way that's compatible with the hardware/toolkit, much like how Float64 is currently not supported. In addition, it would be better to have an external package/type that supports all kinds of targets, like DoubleDouble.jl for floating point, rather than special casing compilation for Int128+Metal.jl.
Yeah, it would be good to error out early instead of running into LLVM back-end issues. And like I mentioned about BigFloat, I don't think we should automatically demote to Int64.
It seems Metal.jl does not support Int128 type fully. It does not give an error when creating a MtlArray{Int128}() but when I try to do operations on them it gives compilation error:
Since the documentation does not say anything about this restriction, I am not sure if this is by design or a bug.
The text was updated successfully, but these errors were encountered: