diff --git a/sycl-fusion/passes/internalization/Internalization.cpp b/sycl-fusion/passes/internalization/Internalization.cpp index 394c77d09387b..1604d7e104ac0 100644 --- a/sycl-fusion/passes/internalization/Internalization.cpp +++ b/sycl-fusion/passes/internalization/Internalization.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -25,6 +26,7 @@ #define DEBUG_TYPE "sycl-fusion" using namespace llvm; +using namespace PatternMatch; constexpr static StringLiteral PrivatePromotion{"private"}; constexpr static StringLiteral LocalPromotion{"local"}; @@ -191,22 +193,10 @@ static void updateInternalizationMD(Function *F, StringRef Kind, /// address space has changed from N to N / LocalSize. static void remap(GetElementPtrInst *GEPI, const PromotionInfo &PromInfo) { IRBuilder<> Builder{GEPI}; - Value *C0 = Builder.getInt64(0); - - auto NIdx = GEPI->getNumIndices(); - if (NIdx > 1) { - // `GEPI` indexes into an aggregate. If the first index is 0, the base - // pointer is used as-is and we do not need to perform remapping. This is - // the common case. - // TODO: Support non-zero pointer offset, too. If the pointer operand is - // a GEP as well, we must check if the source element types match. - assert(GEPI->idx_begin()->get() == C0); - return; - } if (PromInfo.LocalSize == 1) { // Squash the index and let instcombine clean-up afterwards. - GEPI->idx_begin()->set(C0); + GEPI->idx_begin()->set(Builder.getInt64(0)); return; } @@ -290,6 +280,43 @@ Error SYCLInternalizerImpl::canPromoteCall(CallBase *C, const Value *Val, return Error::success(); } +enum GEPKind { INVALID = 0, NEEDS_REMAPPING, ADDRESSES_INTO_AGGREGATE }; + +static int getGEPKind(GetElementPtrInst *GEPI, const PromotionInfo &PromInfo) { + assert(GEPI->getNumIndices() >= 1 && "No-op GEP encountered"); + + // Inspect the GEP's source element type. 
+  auto &DL = GEPI->getModule()->getDataLayout();
+  auto SrcElemTySz = DL.getTypeAllocSize(GEPI->getSourceElementType());
+
+  // `GEPI`'s first index is selecting elements. Unless it is constant zero, we
+  // have to remap. If there are more indices, we start to address into an
+  // aggregate type.
+  if (SrcElemTySz == PromInfo.ElemSize) {
+    int Kind = INVALID;
+    if (!match(GEPI->idx_begin()->get(), m_ZeroInt()))
+      Kind |= NEEDS_REMAPPING;
+    if (GEPI->getNumIndices() >= 2)
+      Kind |= ADDRESSES_INTO_AGGREGATE;
+    assert(Kind != INVALID && "No-op GEP encountered");
+    return Kind;
+  }
+
+  // Check whether `GEPI` adds a constant offset, e.g. a byte offset to address
+  // into a padded structure, smaller than the element size.
+  MapVector<Value *, APInt> VariableOffsets;
+  auto IW = DL.getIndexSizeInBits(GEPI->getPointerAddressSpace());
+  APInt ConstantOffset = APInt::getZero(IW);
+  if (GEPI->collectOffset(DL, IW, VariableOffsets, ConstantOffset) &&
+      VariableOffsets.empty() &&
+      ConstantOffset.getZExtValue() < PromInfo.ElemSize) {
+    return ADDRESSES_INTO_AGGREGATE;
+  }
+
+  // We don't know what `GEPI` addresses; bail out.
+  return INVALID;
+}
+
 Error SYCLInternalizerImpl::canPromoteGEP(GetElementPtrInst *GEPI,
                                           const Value *Val,
                                           const PromotionInfo &PromInfo,
@@ -299,12 +326,17 @@ Error SYCLInternalizerImpl::canPromoteGEP(GetElementPtrInst *GEPI,
     // required.
     return Error::success();
   }
-  // Recurse to check all users of the GEP. We are either already in
-  // `InAggregate` mode, or inspect the current instruction. Recall that a GEP's
-  // first index is used to step through the base pointer, whereas any
-  // additional indices represent addressing into an aggregrate type.
+
+  // Inspect the current instruction.
+  auto Kind = getGEPKind(GEPI, PromInfo);
+  if (Kind == INVALID) {
+    return createStringError(inconvertibleErrorCode(),
+                             "Unsupported pointer arithmetic");
+  }
+
+  // Recurse to check all users of the GEP.
   return canPromoteValue(GEPI, PromInfo,
-                         InAggregate || GEPI->getNumIndices() >= 2);
+                         InAggregate || (Kind & ADDRESSES_INTO_AGGREGATE));
 }
 
 Error SYCLInternalizerImpl::canPromoteValue(Value *Val,
@@ -423,15 +455,17 @@ void SYCLInternalizerImpl::promoteGEPI(GetElementPtrInst *GEPI,
                                        bool InAggregate) const {
   // Not PointerType is unreachable. Other case is caught in caller.
   if (cast<PointerType>(GEPI->getType())->getAddressSpace() != AS) {
-    if (!InAggregate)
+    auto Kind = getGEPKind(GEPI, PromInfo);
+    assert(Kind != INVALID);
+
+    if (!InAggregate && (Kind & NEEDS_REMAPPING)) {
       remap(GEPI, PromInfo);
+    }
     GEPI->mutateType(PointerType::get(GEPI->getContext(), AS));
-    // Recurse to promote to all users of the GEP. We are either already in
-    // `InAggregate` mode, or inspect the current instruction. Recall that a
-    // GEP's first index is used to step through the base pointer, whereas any
-    // additional indices represent addressing into an aggregrate type.
+
+    // Recurse to promote to all users of the GEP.
return promoteValue(GEPI, PromInfo, - InAggregate || GEPI->getNumIndices() >= 2); + InAggregate || (Kind & ADDRESSES_INTO_AGGREGATE)); } } diff --git a/sycl-fusion/test/internalization/promote-private-non-unit-cuda.ll b/sycl-fusion/test/internalization/promote-private-non-unit-cuda.ll new file mode 100644 index 0000000000000..d786ab1529667 --- /dev/null +++ b/sycl-fusion/test/internalization/promote-private-non-unit-cuda.ll @@ -0,0 +1,116 @@ +; REQUIRES: cuda +; RUN: opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext \ +; RUN: -passes=sycl-internalization -S %s | FileCheck %s + +; This test is a reduced IR version of +; sycl/test-e2e/KernelFusion/internalize_non_unit_localsize.cpp for CUDA + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%"class.sycl::_V1::range" = type { %"class.sycl::_V1::detail::array" } +%"class.sycl::_V1::detail::array" = type { [1 x i64] } +%struct.MyStruct = type { i32, %"class.sycl::_V1::vec" } +%"class.sycl::_V1::vec" = type { <3 x i32> } + +declare noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0 +declare noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #0 +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 +declare ptr @llvm.nvvm.implicit.offset() #1 + +define void @fused_0(ptr addrspace(1) nocapture noundef align 16 %KernelOne__arg_accTmp, + ptr nocapture noundef readonly byval(%"class.sycl::_V1::range") align 8 %KernelOne__arg_accTmp3, + ptr addrspace(1) nocapture noundef readonly align 4 %KernelOne__arg_accIn, + ptr nocapture noundef readonly byval(%"class.sycl::_V1::range") align 8 %KernelOne__arg_accIn6, + ptr addrspace(1) nocapture noundef align 1 %KernelOne__arg_accTmp27, + ptr nocapture noundef readonly byval(%"class.sycl::_V1::range") align 8 %KernelOne__arg_accTmp210, + ptr addrspace(1) nocapture noundef writeonly align 4 %KernelTwo__arg_accOut, + ptr nocapture noundef readonly byval(%"class.sycl::_V1::range") align 8 %KernelTwo__arg_accOut3) + 
local_unnamed_addr #3 !sycl.kernel.promote !13 !sycl.kernel.promote.localsize !14 !sycl.kernel.promote.elemsize !15 { +; CHECK-LABEL: define void @fused_0( +; CHECK-SAME: ptr nocapture noundef readonly byval(%"class.sycl::_V1::range") align 8 [[KERNELONE__ARG_ACCTMP3:%[^,]*accTmp3]], +; CHECK-SAME: ptr nocapture noundef readonly byval(%"class.sycl::_V1::range") align 8 [[KERNELONE__ARG_ACCTMP210:%[^,]*accTmp210]] +; CHECK: entry: +; CHECK: [[TMP0:%.*]] = alloca i8, i64 3, align 1 +; CHECK: [[TMP1:%.*]] = alloca i8, i64 96, align 16 +; CHECK: [[KERNELONE__ARG_ACCTMP2103_SROA_0_0_COPYLOAD:%.*]] = load i64, ptr [[KERNELONE__ARG_ACCTMP210]], align 8 +; CHECK: [[KERNELONE__ARG_ACCTMP31_SROA_0_0_COPYLOAD:%.*]] = load i64, ptr [[KERNELONE__ARG_ACCTMP3]], align 8 +; CHECK: [[TMP2:%.*]] = urem i64 [[KERNELONE__ARG_ACCTMP31_SROA_0_0_COPYLOAD]], 3 +; CHECK: [[TMP3:%.*]] = urem i64 [[KERNELONE__ARG_ACCTMP2103_SROA_0_0_COPYLOAD]], 3 +; CHECK: [[MUL:%.*]] = mul nuw nsw i64 [[GLOBAL_ID:.*]], 3 +; CHECK: [[ADD:%.*]] = add nuw nsw i64 [[MUL]], 1 +; CHECK: [[TMP10:%.*]] = add i64 [[TMP2]], [[ADD]] +; CHECK: [[TMP11:%.*]] = urem i64 [[TMP10]], 3 +; CHECK: [[ARRAYIDX_1:%.*]] = getelementptr inbounds %struct.MyStruct, ptr [[TMP1]], i64 [[TMP11]] + +; COM: This i8-GEP _was_ not remapped because it addresses into a single MyStruct element +; CHECK: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX_1]], i64 20 +; CHECK: store i32 {{.*}}, ptr [[ARRAYIDX_2]], align 4 +; CHECK: [[TMP12:%.*]] = add i64 [[TMP3]], [[ADD]] +; CHECK: [[TMP13:%.*]] = urem i64 [[TMP12]], 3 + +; COM: This i8-GEP was remapped because it selects an element of the underlying i8-buffer +; CHECK: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 [[TMP13]] + +; CHECK: store i8 {{.*}}, ptr [[ARRAYIDX_3]], align 1 +; CHECK: store i32 {{.*}}, ptr addrspace(1) +; CHECK: ret void +; +entry: + %KernelOne__arg_accTmp2103.sroa.0.0.copyload = load i64, ptr %KernelOne__arg_accTmp210, align 8 + 
%KernelOne__arg_accIn62.sroa.0.0.copyload = load i64, ptr %KernelOne__arg_accIn6, align 8 + %KernelOne__arg_accTmp31.sroa.0.0.copyload = load i64, ptr %KernelOne__arg_accTmp3, align 8 + %add.ptr.j2 = getelementptr inbounds %struct.MyStruct, ptr addrspace(1) %KernelOne__arg_accTmp, i64 %KernelOne__arg_accTmp31.sroa.0.0.copyload + %add.ptr.i37.i = getelementptr inbounds i32, ptr addrspace(1) %KernelOne__arg_accIn, i64 %KernelOne__arg_accIn62.sroa.0.0.copyload + %add.ptr.i43.i = getelementptr inbounds i8, ptr addrspace(1) %KernelOne__arg_accTmp27, i64 %KernelOne__arg_accTmp2103.sroa.0.0.copyload + %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + %conv.i1.j7 = sext i32 %0 to i64 + %1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + %conv.i3.j7 = sext i32 %1 to i64 + %mul.j7 = mul nsw i64 %conv.i3.j7, %conv.i1.j7 + %2 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %conv.i2.j7 = sext i32 %2 to i64 + %add.j7 = add nsw i64 %mul.j7, %conv.i2.j7 + %3 = tail call ptr @llvm.nvvm.implicit.offset() + %4 = load i32, ptr %3, align 4 + %conv.j8 = zext i32 %4 to i64 + %add4.j7 = add nsw i64 %add.j7, %conv.j8 + %mul.j2 = mul nuw nsw i64 %add4.j7, 3 + %add.j2 = add nuw nsw i64 %mul.j2, 1 + %arrayidx.j2 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr.i37.i, i64 %add.j2 + %5 = load i32, ptr addrspace(1) %arrayidx.j2, align 4 + %arrayidx.i55.i = getelementptr inbounds %struct.MyStruct, ptr addrspace(1) %add.ptr.j2, i64 %add.j2 + %arrayidx.j3 = getelementptr inbounds i8, ptr addrspace(1) %arrayidx.i55.i, i64 20 + store i32 %5, ptr addrspace(1) %arrayidx.j3, align 4 + %conv.j2 = trunc i32 %5 to i8 + %arrayidx.i73.i = getelementptr inbounds i8, ptr addrspace(1) %add.ptr.i43.i, i64 %add.j2 + store i8 %conv.j2, ptr addrspace(1) %arrayidx.i73.i, align 1 + %KernelTwo__arg_accOut34.sroa.0.0.copyload = load i64, ptr %KernelTwo__arg_accOut3, align 8 + %add.ptr.i.i7 = getelementptr inbounds i32, ptr addrspace(1) %KernelTwo__arg_accOut, i64 
%KernelTwo__arg_accOut34.sroa.0.0.copyload + %6 = load i32, ptr %3, align 4 + %conv.j7.i13 = zext i32 %6 to i64 + %add4.j6.i14 = add nsw i64 %add.j7, %conv.j7.i13 + %mul.i.i16 = mul nuw nsw i64 %add4.j6.i14, 3 + %add.i45.i = add nuw nsw i64 %mul.i.i16, 1 + %arrayidx.i.i17 = getelementptr inbounds %struct.MyStruct, ptr addrspace(1) %add.ptr.j2, i64 %add.i45.i + %arrayidx.j2.i19 = getelementptr inbounds i8, ptr addrspace(1) %arrayidx.i.i17, i64 20 + %7 = load i32, ptr addrspace(1) %arrayidx.j2.i19, align 4 + %arrayidx.i55.i20 = getelementptr inbounds i8, ptr addrspace(1) %add.ptr.i43.i, i64 %add.i45.i + %8 = load i8, ptr addrspace(1) %arrayidx.i55.i20, align 1 + %conv.i.i22 = sext i8 %8 to i32 + %add.i.i23 = add nsw i32 %7, %conv.i.i22 + %arrayidx.i64.i = getelementptr inbounds i32, ptr addrspace(1) %add.ptr.i.i7, i64 %add.i45.i + store i32 %add.i.i23, ptr addrspace(1) %arrayidx.i64.i, align 4 + ret void +} + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { nofree nosync nounwind speculatable memory(none) } +attributes #3 = { nofree nosync nounwind memory(read, argmem: readwrite, inaccessiblemem: write) "frame-pointer"="all" "target-cpu"="sm_80" "target-features"="+ptx82,+sm_80" "uniform-work-group-size"="true" } + +!nvvm.annotations = !{!10} + +!10 = !{ptr @fused_0, !"kernel", i32 1} +!13 = !{!"private", !"none", !"none", !"none", !"private", !"none", !"none", !"none"} +!14 = !{i64 3, !"", !"", !"", i64 3, !"", !"", !""} +!15 = !{i64 32, !"", !"", !"", i64 1, !"", !"", !""} diff --git a/sycl-fusion/test/internalization/promote-private-non-unit-hip.ll b/sycl-fusion/test/internalization/promote-private-non-unit-hip.ll new file mode 100644 index 0000000000000..b08d7ba472e57 --- /dev/null +++ b/sycl-fusion/test/internalization/promote-private-non-unit-hip.ll @@ -0,0 +1,137 @@ +; REQUIRES: hip_amd +; RUN: opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext \ +; RUN: 
-passes=sycl-internalization -S %s | FileCheck %s + +; This test is the IR version of +; sycl/test-e2e/KernelFusion/internalize_non_unit_localsize.cpp for HIP. +; In contrast to the SPIR-V and CUDA versions, the sycl::vec in the test data +; structure is addressed via a multi-index GEP with a non-zero first index. + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8" +target triple = "amdgcn-amd-amdhsa" + +%struct.MyStruct = type { i32, %"class.sycl::_V1::vec" } +%"class.sycl::_V1::vec" = type { <3 x i32> } + +declare i32 @llvm.amdgcn.workgroup.id.x() #0 +declare align 4 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0 +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare ptr addrspace(5) @llvm.amdgcn.implicit.offset() #1 + +define amdgpu_kernel void @fused_0(ptr addrspace(1) noundef align 16 %KernelOne__arg_accTmp31, + i64 %KernelOne__arg_accTmp.coerce, + i64 %KernelOne__arg_accTmp.coerce3, + i64 %KernelOne__arg_accTmp.coerce7, + ptr addrspace(1) noundef align 4 %KernelOne__arg_accIn35, + i64 %KernelOne__arg_accIn.coerce, + i64 %KernelOne__arg_accIn.coerce13, + i64 %KernelOne__arg_accIn.coerce17, + ptr addrspace(1) noundef align 1 %KernelOne__arg_accTmp239, + i64 %KernelOne__arg_accTmp2.coerce, + i64 %KernelOne__arg_accTmp2.coerce24, + i64 %KernelOne__arg_accTmp2.coerce28, + ptr addrspace(1) noundef align 4 %KernelTwo__arg_accOut30, + i64 %KernelTwo__arg_accOut.coerce, + i64 %KernelTwo__arg_accOut.coerce3, + i64 %KernelTwo__arg_accOut.coerce7) + #3 !sycl.kernel.promote !12 !sycl.kernel.promote.localsize !13 !sycl.kernel.promote.elemsize !14 { +; CHECK-LABEL: define amdgpu_kernel void @fused_0( +; CHECK-SAME: i64 [[KERNELONE__ARG_ACCTMP_COERCE7:%[^,]*accTmp.coerce7]] +; CHECK-SAME: i64 [[KERNELONE__ARG_ACCTMP2_COERCE28:%[^,]*accTmp2.coerce28]] +; CHECK: entry: +; CHECK: [[TMP0:%.*]] = 
alloca i8, i64 3, align 1, addrspace(5) +; CHECK: [[TMP1:%.*]] = alloca i8, i64 96, align 16, addrspace(5) +; CHECK: [[TMP2:%.*]] = urem i64 [[KERNELONE__ARG_ACCTMP_COERCE7]], 3 +; CHECK: [[TMP3:%.*]] = urem i64 [[KERNELONE__ARG_ACCTMP2_COERCE28]], 3 +; CHECK: [[MUL:%.*]] = mul nuw nsw i64 [[GLOBAL_ID:.*]], 3 +; CHECK: [[ADD:%.*]] = add nuw nsw i64 [[MUL]], 1 +; CHECK: [[TMP11:%.*]] = add i64 [[TMP2]], [[ADD]] +; CHECK: [[TMP12:%.*]] = urem i64 [[TMP11]], 3 + +; COM: This is a multi-index GEP into the aggregate. We have to remap the first index. +; CHECK: [[V:%.*]] = getelementptr inbounds %struct.MyStruct, ptr addrspace(5) [[TMP1]], i64 [[TMP12]], i32 1 + +; COM: This is a single-index GEP which shall not be remapped because its pointer operand already points into the struct (see above). +; CHECK: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr addrspace(5) [[V]], i64 1 + +; CHECK: store i32 {{.*}}, ptr addrspace(5) [[ARRAYIDX_1]], align 4 +; CHECK: [[TMP13:%.*]] = add i64 [[TMP3]], [[ADD]] +; CHECK: [[TMP14:%.*]] = urem i64 [[TMP13]], 3 + +; COM: This i8-GEP was remapped because it selects an element of the underlying i8-buffer +; CHECK: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[TMP0]], i64 [[TMP14]] +; CHECK: store i8 {{.*}}, ptr addrspace(5) [[ARRAYIDX_2]], align 1 +; CHECK: store i32 {{.*}} ptr addrspace(1) +; CHECK: ret void +; +entry: + %add.ptr.j2 = getelementptr inbounds %struct.MyStruct, ptr addrspace(1) %KernelOne__arg_accTmp31, i64 %KernelOne__arg_accTmp.coerce7 + %add.ptr.i82.i = getelementptr inbounds i32, ptr addrspace(1) %KernelOne__arg_accIn35, i64 %KernelOne__arg_accIn.coerce17 + %add.ptr.i85.i = getelementptr inbounds i8, ptr addrspace(1) %KernelOne__arg_accTmp239, i64 %KernelOne__arg_accTmp2.coerce28 + %0 = call i32 @llvm.amdgcn.workgroup.id.x() + %conv.i1.j7 = zext i32 %0 to i64 + %1 = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() + %arrayidx.j8 = getelementptr inbounds i16, ptr 
addrspace(4) %1, i64 2 + %2 = load i16, ptr addrspace(4) %arrayidx.j8, align 4 + %conv.j8 = zext i16 %2 to i64 + %mul.j7 = mul nuw nsw i64 %conv.j8, %conv.i1.j7 + %3 = call i32 @llvm.amdgcn.workitem.id.x(), !range !20, !noundef !21 + %conv.i2.j7 = zext nneg i32 %3 to i64 + %add.j7 = add nuw nsw i64 %mul.j7, %conv.i2.j7 + %4 = call ptr addrspace(5) @llvm.amdgcn.implicit.offset() + %5 = load i32, ptr addrspace(5) %4, align 4 + %zext.j8 = zext i32 %5 to i64 + %add4.j7 = add nuw nsw i64 %add.j7, %zext.j8 + %mul.j2 = mul nuw nsw i64 %add4.j7, 3 + %add.j2 = add nuw nsw i64 %mul.j2, 1 + %arrayidx.j2 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr.i82.i, i64 %add.j2 + %6 = load i32, ptr addrspace(1) %arrayidx.j2, align 4 + %v.j2 = getelementptr inbounds %struct.MyStruct, ptr addrspace(1) %add.ptr.j2, i64 %add.j2, i32 1 + %arrayidx.j3 = getelementptr inbounds i32, ptr addrspace(1) %v.j2, i64 1 + store i32 %6, ptr addrspace(1) %arrayidx.j3, align 4 + %conv.j2 = trunc i32 %6 to i8 + %arrayidx.i104.i = getelementptr inbounds i8, ptr addrspace(1) %add.ptr.i85.i, i64 %add.j2 + store i8 %conv.j2, ptr addrspace(1) %arrayidx.i104.i, align 1 + %add.ptr.i.i1 = getelementptr inbounds i32, ptr addrspace(1) %KernelTwo__arg_accOut30, i64 %KernelTwo__arg_accOut.coerce7 + %add.ptr.i81.i = getelementptr inbounds %struct.MyStruct, ptr addrspace(1) %KernelOne__arg_accTmp31, i64 %KernelOne__arg_accTmp.coerce7 + %add.ptr.i84.i = getelementptr inbounds i8, ptr addrspace(1) %KernelOne__arg_accTmp239, i64 %KernelOne__arg_accTmp2.coerce28 + %7 = call i32 @llvm.amdgcn.workgroup.id.x() + %conv.i1.j6.i2 = zext i32 %7 to i64 + %8 = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() + %arrayidx.j7.i3 = getelementptr inbounds i16, ptr addrspace(4) %8, i64 2 + %9 = load i16, ptr addrspace(4) %arrayidx.j7.i3, align 4 + %conv.j7.i4 = zext i16 %9 to i64 + %mul.j6.i5 = mul nuw nsw i64 %conv.j7.i4, %conv.i1.j6.i2 + %10 = call i32 @llvm.amdgcn.workitem.id.x(), !range !20, 
!noundef !21 + %conv.i2.j6.i6 = zext nneg i32 %10 to i64 + %add.j6.i7 = add nuw nsw i64 %mul.j6.i5, %conv.i2.j6.i6 + %11 = call ptr addrspace(5) @llvm.amdgcn.implicit.offset() + %12 = load i32, ptr addrspace(5) %11, align 4 + %zext.j7.i8 = zext i32 %12 to i64 + %add4.j6.i9 = add nuw nsw i64 %add.j6.i7, %zext.j7.i8 + %mul.i.i11 = mul nuw nsw i64 %add4.j6.i9, 3 + %add.i87.i = add nuw nsw i64 %mul.i.i11, 1 + %v.i.i12 = getelementptr inbounds %struct.MyStruct, ptr addrspace(1) %add.ptr.i81.i, i64 %add.i87.i, i32 1 + %arrayidx.j2.i13 = getelementptr inbounds i32, ptr addrspace(1) %v.i.i12, i64 1 + %13 = load i32, ptr addrspace(1) %arrayidx.j2.i13, align 4 + %arrayidx.i92.i = getelementptr inbounds i8, ptr addrspace(1) %add.ptr.i84.i, i64 %add.i87.i + %14 = load i8, ptr addrspace(1) %arrayidx.i92.i, align 1 + %conv.i.i14 = sext i8 %14 to i32 + %add.i.i15 = add nsw i32 %13, %conv.i.i14 + %arrayidx.i98.i = getelementptr inbounds i32, ptr addrspace(1) %add.ptr.i.i1, i64 %add.i87.i + store i32 %add.i.i15, ptr addrspace(1) %arrayidx.i98.i, align 4 + ret void +} + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { nounwind speculatable memory(none) } +attributes #3 = { "frame-pointer"="all" "target-cpu"="gfx1031" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" "uniform-work-group-size"="true" } + +!amdgcn.annotations = !{!9} + +!9 = !{ptr @fused_0, !"kernel", i32 1} +!12 = !{!"private", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"private", !"none", !"none", !"none", !"none", !"none", !"none", !"none"} +!13 = !{i64 3, !"", !"", !"", !"", !"", !"", !"", i64 3, !"", !"", !"", !"", !"", !"", !""} +!14 = !{i64 32, !"", !"", !"", !"", !"", !"", !"", i64 1, !"", !"", !"", !"", !"", !"", !""} +!20 = !{i32 0, i32 1024} +!21 = 
!{} diff --git a/sycl-fusion/test/internalization/promote-private-non-unit.ll b/sycl-fusion/test/internalization/promote-private-non-unit.ll new file mode 100644 index 0000000000000..b4b73aa9881b9 --- /dev/null +++ b/sycl-fusion/test/internalization/promote-private-non-unit.ll @@ -0,0 +1,91 @@ +; RUN: opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext \ +; RUN: -passes=sycl-internalization -S %s | FileCheck %s + +; This test is a reduced IR version of +; sycl/test-e2e/KernelFusion/internalize_non_unit_localsize.cpp + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir64-unknown-unknown" + +%"class.sycl::_V1::range" = type { %"class.sycl::_V1::detail::array" } +%"class.sycl::_V1::detail::array" = type { [1 x i64] } +%struct.MyStruct = type { i32, %"class.sycl::_V1::vec" } +%"class.sycl::_V1::vec" = type { <3 x i32> } + +declare spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32) local_unnamed_addr #1 + +define spir_kernel void @fused_0(ptr addrspace(1) nocapture align 16 %KernelOne__arg_accTmp, + ptr nocapture readonly byval(%"class.sycl::_V1::range") align 8 %KernelOne__arg_accTmp3, + ptr addrspace(1) nocapture readonly align 4 %KernelOne__arg_accIn, + ptr nocapture readonly byval(%"class.sycl::_V1::range") align 8 %KernelOne__arg_accIn6, + ptr addrspace(1) nocapture writeonly align 1 %KernelOne__arg_accTmp27, + ptr nocapture readonly byval(%"class.sycl::_V1::range") align 8 %KernelOne__arg_accTmp210, + ptr addrspace(1) nocapture writeonly align 4 %KernelTwo__arg_accOut, + ptr nocapture readonly byval(%"class.sycl::_V1::range") align 8 %KernelTwo__arg_accOut3) + local_unnamed_addr #2 !sycl.kernel.promote !11 !sycl.kernel.promote.localsize !12 !sycl.kernel.promote.elemsize !13 { +; CHECK-LABEL: define spir_kernel void @fused_0( +; CHECK-SAME: ptr 
nocapture readonly byval(%"class.sycl::_V1::range") align 8 [[KERNELONE__ARG_ACCTMP3:%[^,]*accTmp3]], +; CHECK-SAME: ptr nocapture readonly byval(%"class.sycl::_V1::range") align 8 [[KERNELONE__ARG_ACCTMP210:%[^,]*accTmp210]] +; CHECK: entry: +; CHECK: [[TMP0:%.*]] = alloca i8, i64 3, align 1 +; CHECK: [[TMP1:%.*]] = alloca i8, i64 96, align 16 +; CHECK: [[KERNELONE__ARG_ACCTMP2103_SROA_0_0_COPYLOAD:%.*]] = load i64, ptr [[KERNELONE__ARG_ACCTMP210]], align 8 +; CHECK: [[KERNELONE__ARG_ACCTMP31_SROA_0_0_COPYLOAD:%.*]] = load i64, ptr [[KERNELONE__ARG_ACCTMP3]], align 8 +; CHECK: [[TMP2:%.*]] = urem i64 [[KERNELONE__ARG_ACCTMP31_SROA_0_0_COPYLOAD]], 3 +; CHECK: [[TMP3:%.*]] = urem i64 [[KERNELONE__ARG_ACCTMP2103_SROA_0_0_COPYLOAD]], 3 +; CHECK: [[TMP4:%.*]] = tail call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) +; CHECK: [[MUL:%.*]] = mul nuw nsw i64 [[TMP4]], 3 +; CHECK: [[ADD:%.*]] = add nuw nsw i64 [[MUL]], 1 +; CHECK: [[TMP6:%.*]] = add i64 [[TMP2]], [[ADD]] +; CHECK: [[TMP7:%.*]] = urem i64 [[TMP6]], 3 +; CHECK: [[ARRAYIDX_1:%.*]] = getelementptr inbounds %struct.MyStruct, ptr [[TMP1]], i64 [[TMP7]] + +; COM: This i8-GEP _was_ not remapped because it addresses into a single MyStruct element +; CHECK: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX_1]], i64 20 + +; CHECK: store i32 {{.*}}, ptr [[ARRAYIDX_2]], align 4 +; CHECK: [[TMP8:%.*]] = add i64 [[TMP3]], [[ADD]] +; CHECK: [[TMP9:%.*]] = urem i64 [[TMP8]], 3 + +; COM: This i8-GEP was remapped because it selects an element of the underlying i8-buffer +; CHECK: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 [[TMP9]] + +; CHECK: store i8 {{.*}}, ptr [[ARRAYIDX_3]], align 1 +; CHECK: store i32 {{.*}}, ptr addrspace(1) +; CHECK: ret void +; +entry: + %KernelOne__arg_accTmp2103.sroa.0.0.copyload = load i64, ptr %KernelOne__arg_accTmp210, align 8 + %KernelOne__arg_accIn62.sroa.0.0.copyload = load i64, ptr %KernelOne__arg_accIn6, align 8 + 
%KernelOne__arg_accTmp31.sroa.0.0.copyload = load i64, ptr %KernelOne__arg_accTmp3, align 8 + %add.ptr.j2 = getelementptr inbounds %struct.MyStruct, ptr addrspace(1) %KernelOne__arg_accTmp, i64 %KernelOne__arg_accTmp31.sroa.0.0.copyload + %add.ptr.i35.i = getelementptr inbounds i32, ptr addrspace(1) %KernelOne__arg_accIn, i64 %KernelOne__arg_accIn62.sroa.0.0.copyload + %add.ptr.i44.i = getelementptr inbounds i8, ptr addrspace(1) %KernelOne__arg_accTmp27, i64 %KernelOne__arg_accTmp2103.sroa.0.0.copyload + %0 = tail call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) #3 + %mul.j2 = mul nuw nsw i64 %0, 3 + %add.j2 = add nuw nsw i64 %mul.j2, 1 + %arrayidx.j2 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr.i35.i, i64 %add.j2 + %1 = load i32, ptr addrspace(1) %arrayidx.j2, align 4 + %arrayidx.i54.i = getelementptr inbounds %struct.MyStruct, ptr addrspace(1) %add.ptr.j2, i64 %add.j2 + %arrayidx.j3 = getelementptr inbounds i8, ptr addrspace(1) %arrayidx.i54.i, i64 20 + store i32 %1, ptr addrspace(1) %arrayidx.j3, align 4 + %conv.j2 = trunc i32 %1 to i8 + %arrayidx.i70.i = getelementptr inbounds i8, ptr addrspace(1) %add.ptr.i44.i, i64 %add.j2 + store i8 %conv.j2, ptr addrspace(1) %arrayidx.i70.i, align 1 + %KernelTwo__arg_accOut34.sroa.0.0.copyload = load i64, ptr %KernelTwo__arg_accOut3, align 8 + %add.ptr.i.i7 = getelementptr inbounds i32, ptr addrspace(1) %KernelTwo__arg_accOut, i64 %KernelTwo__arg_accOut34.sroa.0.0.copyload + %2 = load i32, ptr addrspace(1) %arrayidx.j3, align 4 + %conv.i.i13 = sext i8 %conv.j2 to i32 + %add.i.i14 = add nsw i32 %2, %conv.i.i13 + %arrayidx.i62.i = getelementptr inbounds i32, ptr addrspace(1) %add.ptr.i.i7, i64 %add.j2 + store i32 %add.i.i14, ptr addrspace(1) %arrayidx.i62.i, align 4 + ret void +} + +attributes #1 = { mustprogress nofree nosync nounwind willreturn memory(none) } +attributes #2 = { mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: write) } +attributes #3 = { 
nounwind willreturn memory(none) }
+
+!11 = !{!"private", !"none", !"none", !"none", !"private", !"none", !"none", !"none"}
+!12 = !{i64 3, !"", !"", !"", i64 3, !"", !"", !""}
+!13 = !{i64 32, !"", !"", !"", i64 1, !"", !"", !""}
diff --git a/sycl/test-e2e/KernelFusion/internalize_non_unit_localsize.cpp b/sycl/test-e2e/KernelFusion/internalize_non_unit_localsize.cpp
new file mode 100644
index 0000000000000..3b302b733f5c5
--- /dev/null
+++ b/sycl/test-e2e/KernelFusion/internalize_non_unit_localsize.cpp
@@ -0,0 +1,96 @@
+// REQUIRES: fusion
+// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
+// RUN: %{run} %t.out
+
+// Test private internalization with "LocalSize" == 3 on buffers that trigger
+// special cases in the GEP analysis during remapping.
+// - `tmp`:
+//   - On SPIR-V and CUDA targets, the IR contains `i8`-typed GEPs to access the
+//     elements in the sycl::vec. These GEPs shall _not_ be remapped.
+//   - On HIP, the IR contains GEP instructions that add a pointer offset (hence
+//     must be remapped) _and_ address into the aggregate element.
+// - `tmp2` is an `i8` buffer. The corresponding `i8`-typed GEPs must be
+//   remapped during internalization.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+struct MyStruct {
+  int pad;
+  sycl::int3 v;
+};
+
+int main() {
+  constexpr int dataSize = 384;
+  std::array<int, dataSize> in, out;
+  std::array<MyStruct, dataSize> tmp;
+  std::array<int8_t, dataSize> tmp2;
+
+  for (int i = 0; i < dataSize; ++i) {
+    in[i] = i;
+    tmp[i].v.y() = -1;
+    tmp2[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer bIn{in.data(), range{dataSize}};
+    buffer bTmp{tmp.data(), range{dataSize}};
+    buffer bTmp2{tmp2.data(), range{dataSize}};
+    buffer bOut{out.data(), range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn = bIn.get_access(cgh);
+      auto accTmp = bTmp.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_private{});
+      auto accTmp2 = bTmp2.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_private{});
+      cgh.parallel_for<class KernelOne>(range<1>{dataSize / 3}, [=](id<1> i) {
+        accTmp[3 * i].v.y() = accIn[3 * i];
+        accTmp[3 * i + 1].v.y() = accIn[3 * i + 1];
+        accTmp[3 * i + 2].v.y() = accIn[3 * i + 2];
+        accTmp2[3 * i + 2] = static_cast<int8_t>(accIn[3 * i] ^ 0xAA);
+        accTmp2[3 * i + 1] = static_cast<int8_t>(accIn[3 * i + 1] ^ 0xAA);
+        accTmp2[3 * i] = static_cast<int8_t>(accIn[3 * i + 2] ^ 0xAA);
+      });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_private{});
+      auto accTmp2 = bTmp2.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_private{});
+      auto accIn = bIn.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for<class KernelTwo>(range<1>{dataSize / 3}, [=](id<1> i) {
+        accOut[3 * i] = accTmp[3 * i].v.y() * accIn[3 * i] + accTmp2[3 * i + 2];
+        accOut[3 * i + 1] =
+            accTmp[3 * i + 1].v.y() * accIn[3 * i + 1] + accTmp2[3 * i + 1];
+        accOut[3 * i + 2] =
+            accTmp[3 * i + 2].v.y() * accIn[3 * i + 2] + accTmp2[3 * i];
+      });
+    });
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+  }
+
+  // Check the results
+  for (int i = 0; i < dataSize; ++i) {
+    assert(out[i] == (i * i + static_cast<int8_t>(i ^ 0xAA)) &&
+           "Computation error");
+    assert(tmp[i].v.y() == -1 && tmp2[i] == -1 && "Not internalized");
+  }
+
+  return 0;
+}