From f9a89e6b9c4345df978bf7cbfedfd2b250029278 Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Wed, 1 Nov 2023 11:11:48 -0700 Subject: [PATCH] [OpenMP][FIX] Allocate per launch memory for GPU team reductions (#70752) We used to perform team reduction on global memory allocated in the runtime and by clang. This was racy as multiple instances of a kernel, or different kernels with team reductions, would use the same locations. Since we now have the kernel launch environment, we can allocate dynamic memory per-launch, allowing us to move all the state into a non-racy place. Fixes: https://github.com/llvm/llvm-project/issues/70249 --- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 75 ++---- clang/lib/CodeGen/CGOpenMPRuntimeGPU.h | 2 - .../OpenMP/nvptx_teams_reduction_codegen.cpp | 240 +++++++++--------- .../target_teams_generic_loop_codegen.cpp | 20 +- .../DeviceRTL/include/Interface.h | 2 + .../libomptarget/DeviceRTL/src/Reduction.cpp | 10 +- openmp/libomptarget/include/Environment.h | 25 +- .../PluginInterface/PluginInterface.cpp | 16 +- .../parallel_target_teams_reduction.cpp | 36 +++ 9 files changed, 231 insertions(+), 195 deletions(-) create mode 100644 openmp/libomptarget/test/offloading/parallel_target_teams_reduction.cpp diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index bd9329b8e2d41..0ed665e0dfb97 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -803,8 +803,30 @@ void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF, if (!IsSPMD) emitGenericVarsEpilog(CGF); + // This is temporary until we remove the fixed sized buffer. + ASTContext &C = CGM.getContext(); + RecordDecl *StaticRD = C.buildImplicitRecord( + "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union); + StaticRD->startDefinition(); + for (const RecordDecl *TeamReductionRec : TeamsReductions) { + QualType RecTy = C.getRecordType(TeamReductionRec); + auto *Field = FieldDecl::Create( + C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy, + C.getTrivialTypeSourceInfo(RecTy, SourceLocation()), + /*BW=*/nullptr, /*Mutable=*/false, + /*InitStyle=*/ICIS_NoInit); + Field->setAccess(AS_public); + StaticRD->addDecl(Field); + } + StaticRD->completeDefinition(); + QualType StaticTy = C.getRecordType(StaticRD); + llvm::Type *LLVMReductionsBufferTy = + CGM.getTypes().ConvertTypeForMem(StaticTy); + const auto &DL = CGM.getModule().getDataLayout(); + uint64_t BufferSize = + DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue(); CGBuilderTy &Bld = CGF.Builder; - OMPBuilder.createTargetDeinit(Bld); + OMPBuilder.createTargetDeinit(Bld, BufferSize); } void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D, @@ -2998,15 +3020,10 @@ void CGOpenMPRuntimeGPU::emitReduction( CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, C.getLangOpts().OpenMPCUDAReductionBufNum); TeamsReductions.push_back(TeamReductionRec); - if (!KernelTeamsReductionPtr) { - KernelTeamsReductionPtr = new llvm::GlobalVariable( - CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true, - llvm::GlobalValue::InternalLinkage, nullptr, - "_openmp_teams_reductions_buffer_$_$ptr"); - } - llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar( - Address(KernelTeamsReductionPtr, CGF.VoidPtrTy, CGM.getPointerAlign()), - /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc); + auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer), + {}, "_openmp_teams_reductions_buffer_$_$ptr"); llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction( CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap); llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction( @@ -3021,7 +3038,7 @@ void CGOpenMPRuntimeGPU::emitReduction( llvm::Value *Args[] = { RTLoc, ThreadId, - GlobalBufferPtr, + KernelTeamsReductionPtr, CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum), RL, ShuffleAndReduceFn, @@ -3654,42 +3671,6 @@ void CGOpenMPRuntimeGPU::processRequiresDirective( CGOpenMPRuntime::processRequiresDirective(D); } -void CGOpenMPRuntimeGPU::clear() { - - if (!TeamsReductions.empty()) { - ASTContext &C = CGM.getContext(); - RecordDecl *StaticRD = C.buildImplicitRecord( - "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union); - StaticRD->startDefinition(); - for (const RecordDecl *TeamReductionRec : TeamsReductions) { - QualType RecTy = C.getRecordType(TeamReductionRec); - auto *Field = FieldDecl::Create( - C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy, - C.getTrivialTypeSourceInfo(RecTy, SourceLocation()), - /*BW=*/nullptr, /*Mutable=*/false, - /*InitStyle=*/ICIS_NoInit); - Field->setAccess(AS_public); - StaticRD->addDecl(Field); - } - StaticRD->completeDefinition(); - QualType StaticTy = C.getRecordType(StaticRD); - llvm::Type *LLVMReductionsBufferTy = - CGM.getTypes().ConvertTypeForMem(StaticTy); - // FIXME: nvlink does not handle weak linkage correctly (object with the - // different size are reported as erroneous). - // Restore CommonLinkage as soon as nvlink is fixed. - auto *GV = new llvm::GlobalVariable( - CGM.getModule(), LLVMReductionsBufferTy, - /*isConstant=*/false, llvm::GlobalValue::InternalLinkage, - llvm::Constant::getNullValue(LLVMReductionsBufferTy), - "_openmp_teams_reductions_buffer_$_"); - KernelTeamsReductionPtr->setInitializer( - llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, - CGM.VoidPtrTy)); - } - CGOpenMPRuntime::clear(); -} - llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; llvm::Module *M = &CGF.CGM.getModule(); diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h index 46e1361f2f895..141436f26230d 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h @@ -130,7 +130,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime { public: explicit CGOpenMPRuntimeGPU(CodeGenModule &CGM); - void clear() override; bool isGPU() const override { return true; }; @@ -386,7 +385,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime { /// Maps the function to the list of the globalized variables with their /// addresses. llvm::SmallDenseMap FunctionGlobalizedDecls; - llvm::GlobalVariable *KernelTeamsReductionPtr = nullptr; /// List of the records with the list of fields for the reductions across the /// teams. Used to build the intermediate buffer for the fast teams /// reductions. diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp index f4ec40b030a41..137ef3861751b 100644 --- a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp @@ -97,14 +97,14 @@ int bar(int n){ // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 // CHECK1-NEXT: store ptr [[E1]], ptr [[TMP4]], align 8 -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 8 -// CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr [[TMP5]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func) -// CHECK1-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 1 -// CHECK1-NEXT: br i1 [[TMP7]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK1-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer() +// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func) +// CHECK1-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 1 +// CHECK1-NEXT: br i1 [[TMP6]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] // CHECK1: .omp.reduction.then: -// CHECK1-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[TMP9:%.*]] = load double, ptr [[E1]], align 8 -// CHECK1-NEXT: [[ADD2:%.*]] = fadd double [[TMP8]], [[TMP9]] +// CHECK1-NEXT: [[TMP7:%.*]] = load double, ptr [[TMP0]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = load double, ptr [[E1]], align 8 +// CHECK1-NEXT: [[ADD2:%.*]] = fadd double [[TMP7]], [[TMP8]] // CHECK1-NEXT: store double [[ADD2]], ptr [[TMP0]], align 8 // CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]]) // CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]] @@ -386,21 +386,21 @@ int bar(int n){ // CHECK1-NEXT: store ptr [[C1]], ptr [[TMP6]], align 8 // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 // CHECK1-NEXT: store ptr [[D2]], ptr [[TMP7]], align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 8 -// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6) -// CHECK1-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1 -// CHECK1-NEXT: br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK1-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer() +// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6) +// CHECK1-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1 +// CHECK1-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] // CHECK1: .omp.reduction.then: -// CHECK1-NEXT: [[TMP11:%.*]] = load i8, ptr [[TMP0]], align 1 -// CHECK1-NEXT: [[CONV4:%.*]] = sext i8 [[TMP11]] to i32 -// CHECK1-NEXT: [[TMP12:%.*]] = load i8, ptr [[C1]], align 1 -// CHECK1-NEXT: [[CONV5:%.*]] = sext i8 [[TMP12]] to i32 +// CHECK1-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP0]], align 1 +// CHECK1-NEXT: [[CONV4:%.*]] = sext i8 [[TMP10]] to i32 +// CHECK1-NEXT: [[TMP11:%.*]] = load i8, ptr [[C1]], align 1 +// CHECK1-NEXT: [[CONV5:%.*]] = sext i8 [[TMP11]] to i32 // CHECK1-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] // CHECK1-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 // CHECK1-NEXT: store i8 [[CONV7]], ptr [[TMP0]], align 1 -// CHECK1-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP1]], align 4 -// CHECK1-NEXT: [[TMP14:%.*]] = load float, ptr [[D2]], align 4 -// CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[TMP13]], [[TMP14]] +// CHECK1-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP1]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load float, ptr [[D2]], align 4 +// CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[TMP12]], [[TMP13]] // CHECK1-NEXT: store float [[MUL8]], ptr [[TMP1]], align 4 // CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) // CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]] @@ -727,29 +727,29 @@ int bar(int n){ // CHECK1-NEXT: store ptr [[A1]], ptr [[TMP6]], align 8 // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 // CHECK1-NEXT: store ptr [[B2]], ptr [[TMP7]], align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 8 -// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14) -// CHECK1-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1 -// CHECK1-NEXT: br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK1-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer() +// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14) +// CHECK1-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1 +// CHECK1-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] // CHECK1: .omp.reduction.then: -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP0]], align 4 -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[A1]], align 4 -// CHECK1-NEXT: [[OR:%.*]] = or i32 [[TMP11]], [[TMP12]] +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP0]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[A1]], align 4 +// CHECK1-NEXT: [[OR:%.*]] = or i32 [[TMP10]], [[TMP11]] // CHECK1-NEXT: store i32 [[OR]], ptr [[TMP0]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP1]], align 2 -// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP13]] to i32 -// CHECK1-NEXT: [[TMP14:%.*]] = load i16, ptr [[B2]], align 2 -// CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK1-NEXT: [[TMP12:%.*]] = load i16, ptr [[TMP1]], align 2 +// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP12]] to i32 +// CHECK1-NEXT: [[TMP13:%.*]] = load i16, ptr [[B2]], align 2 +// CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP13]] to i32 // CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] // CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] // CHECK1: cond.true: -// CHECK1-NEXT: [[TMP15:%.*]] = load i16, ptr [[TMP1]], align 2 +// CHECK1-NEXT: [[TMP14:%.*]] = load i16, ptr [[TMP1]], align 2 // CHECK1-NEXT: br label [[COND_END:%.*]] // CHECK1: cond.false: -// CHECK1-NEXT: [[TMP16:%.*]] = load i16, ptr [[B2]], align 2 +// CHECK1-NEXT: [[TMP15:%.*]] = load i16, ptr [[B2]], align 2 // CHECK1-NEXT: br label [[COND_END]] // CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i16 [ [[TMP15]], [[COND_TRUE]] ], [ [[TMP16]], [[COND_FALSE]] ] +// CHECK1-NEXT: [[COND:%.*]] = phi i16 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] // CHECK1-NEXT: store i16 [[COND]], ptr [[TMP1]], align 2 // CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) // CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]] @@ -1157,13 +1157,13 @@ int bar(int n){ // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 0 // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 -// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0 +// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]] // CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4 // CHECK1-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 128 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8 -// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1 +// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]] // CHECK1-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2 // CHECK1-NEXT: store i16 [[TMP13]], ptr [[TMP12]], align 128 @@ -1183,11 +1183,11 @@ int bar(int n){ // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8 // CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0 +// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP4]] // CHECK1-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 8 // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 -// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1 +// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP4]] // CHECK1-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 8 // CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 8 @@ -1209,13 +1209,13 @@ int bar(int n){ // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 0 // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 -// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0 +// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]] // CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 128 // CHECK1-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 4 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8 -// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1 +// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]] // CHECK1-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 128 // CHECK1-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 2 @@ -1235,11 +1235,11 @@ int bar(int n){ // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8 // CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0 +// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP4]] // CHECK1-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 8 // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 -// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1 +// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP4]] // CHECK1-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 8 // CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 8 @@ -1294,14 +1294,14 @@ int bar(int n){ // CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 // CHECK2-NEXT: store ptr [[E1]], ptr [[TMP4]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4 -// CHECK2-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr [[TMP5]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func) -// CHECK2-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 1 -// CHECK2-NEXT: br i1 [[TMP7]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK2-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer() +// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func) +// CHECK2-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 1 +// CHECK2-NEXT: br i1 [[TMP6]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] // CHECK2: .omp.reduction.then: -// CHECK2-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP0]], align 8 -// CHECK2-NEXT: [[TMP9:%.*]] = load double, ptr [[E1]], align 8 -// CHECK2-NEXT: [[ADD2:%.*]] = fadd double [[TMP8]], [[TMP9]] +// CHECK2-NEXT: [[TMP7:%.*]] = load double, ptr [[TMP0]], align 8 +// CHECK2-NEXT: [[TMP8:%.*]] = load double, ptr [[E1]], align 8 +// CHECK2-NEXT: [[ADD2:%.*]] = fadd double [[TMP7]], [[TMP8]] // CHECK2-NEXT: store double [[ADD2]], ptr [[TMP0]], align 8 // CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]]) // CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]] @@ -1583,21 +1583,21 @@ int bar(int n){ // CHECK2-NEXT: store ptr [[C1]], ptr [[TMP6]], align 4 // CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 // CHECK2-NEXT: store ptr [[D2]], ptr [[TMP7]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4 -// CHECK2-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6) -// CHECK2-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1 -// CHECK2-NEXT: br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK2-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer() +// CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6) +// CHECK2-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1 +// CHECK2-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] // CHECK2: .omp.reduction.then: -// CHECK2-NEXT: [[TMP11:%.*]] = load i8, ptr [[TMP0]], align 1 -// CHECK2-NEXT: [[CONV4:%.*]] = sext i8 [[TMP11]] to i32 -// CHECK2-NEXT: [[TMP12:%.*]] = load i8, ptr [[C1]], align 1 -// CHECK2-NEXT: [[CONV5:%.*]] = sext i8 [[TMP12]] to i32 +// CHECK2-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP0]], align 1 +// CHECK2-NEXT: [[CONV4:%.*]] = sext i8 [[TMP10]] to i32 +// CHECK2-NEXT: [[TMP11:%.*]] = load i8, ptr [[C1]], align 1 +// CHECK2-NEXT: [[CONV5:%.*]] = sext i8 [[TMP11]] to i32 // CHECK2-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] // CHECK2-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 // CHECK2-NEXT: store i8 [[CONV7]], ptr [[TMP0]], align 1 -// CHECK2-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP1]], align 4 -// CHECK2-NEXT: [[TMP14:%.*]] = load float, ptr [[D2]], align 4 -// CHECK2-NEXT: [[MUL8:%.*]] = fmul float [[TMP13]], [[TMP14]] +// CHECK2-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP1]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load float, ptr [[D2]], align 4 +// CHECK2-NEXT: [[MUL8:%.*]] = fmul float [[TMP12]], [[TMP13]] // CHECK2-NEXT: store float [[MUL8]], ptr [[TMP1]], align 4 // CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) // CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]] @@ -1924,29 +1924,29 @@ int bar(int n){ // CHECK2-NEXT: store ptr [[A1]], ptr [[TMP6]], align 4 // CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 // CHECK2-NEXT: store ptr [[B2]], ptr [[TMP7]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4 -// CHECK2-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14) -// CHECK2-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1 -// CHECK2-NEXT: br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK2-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer() +// CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14) +// CHECK2-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1 +// CHECK2-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] // CHECK2: .omp.reduction.then: -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP0]], align 4 -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[A1]], align 4 -// CHECK2-NEXT: [[OR:%.*]] = or i32 [[TMP11]], [[TMP12]] +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP0]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[A1]], align 4 +// CHECK2-NEXT: [[OR:%.*]] = or i32 [[TMP10]], [[TMP11]] // CHECK2-NEXT: store i32 [[OR]], ptr [[TMP0]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP1]], align 2 -// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP13]] to i32 -// CHECK2-NEXT: [[TMP14:%.*]] = load i16, ptr [[B2]], align 2 -// CHECK2-NEXT: [[CONV3:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK2-NEXT: [[TMP12:%.*]] = load i16, ptr [[TMP1]], align 2 +// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP12]] to i32 +// CHECK2-NEXT: [[TMP13:%.*]] = load i16, ptr [[B2]], align 2 +// CHECK2-NEXT: [[CONV3:%.*]] = sext i16 [[TMP13]] to i32 // CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] // CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] // CHECK2: cond.true: -// CHECK2-NEXT: [[TMP15:%.*]] = load i16, ptr [[TMP1]], align 2 +// CHECK2-NEXT: [[TMP14:%.*]] = load i16, ptr [[TMP1]], align 2 // CHECK2-NEXT: br label [[COND_END:%.*]] // CHECK2: cond.false: -// CHECK2-NEXT: [[TMP16:%.*]] = load i16, ptr [[B2]], align 2 +// CHECK2-NEXT: [[TMP15:%.*]] = load i16, ptr [[B2]], align 2 // CHECK2-NEXT: br label [[COND_END]] // CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i16 [ [[TMP15]], [[COND_TRUE]] ], [ [[TMP16]], [[COND_FALSE]] ] +// CHECK2-NEXT: [[COND:%.*]] = phi i16 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] // CHECK2-NEXT: store i16 [[COND]], ptr [[TMP1]], align 2 // CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) // CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]] @@ -2354,13 +2354,13 @@ int bar(int n){ // CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0 // CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4 -// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]] // CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4 // CHECK2-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 128 // CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1 // CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4 -// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1 // CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]] // CHECK2-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2 // CHECK2-NEXT: store i16 [[TMP13]], ptr [[TMP12]], align 128 @@ -2380,11 +2380,11 @@ int bar(int n){ // CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4 // CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0 +// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0 // CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP4]] // CHECK2-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4 // CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1 +// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP4]] // CHECK2-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 4 // CHECK2-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4 @@ -2406,13 +2406,13 @@ int bar(int n){ // CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0 // CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4 -// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]] // CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 128 // CHECK2-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 4 // CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1 // CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4 -// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1 // CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]] // CHECK2-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 128 // CHECK2-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 2 @@ -2432,11 +2432,11 @@ int bar(int n){ // CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4 // CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0 +// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0 // CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP4]] // CHECK2-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4 // CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1 +// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP4]] // CHECK2-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 4 // CHECK2-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4 @@ -2491,14 +2491,14 @@ int bar(int n){ // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 // CHECK3-NEXT: store ptr [[E1]], ptr [[TMP4]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr [[TMP5]], i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func) -// CHECK3-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 1 -// CHECK3-NEXT: br i1 [[TMP7]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK3-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer() +// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func) +// CHECK3-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 1 +// CHECK3-NEXT: br i1 [[TMP6]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] // CHECK3: .omp.reduction.then: -// CHECK3-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP0]], align 8 -// CHECK3-NEXT: [[TMP9:%.*]] = load double, ptr [[E1]], align 8 -// CHECK3-NEXT: [[ADD2:%.*]] = fadd double [[TMP8]], [[TMP9]] +// CHECK3-NEXT: [[TMP7:%.*]] = load double, ptr [[TMP0]], align 8 +// CHECK3-NEXT: [[TMP8:%.*]] = load double, ptr [[E1]], align 8 +// CHECK3-NEXT: [[ADD2:%.*]] = fadd double [[TMP7]], [[TMP8]] // CHECK3-NEXT: store double [[ADD2]], ptr [[TMP0]], align 8 // CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]]) // CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]] @@ -2780,21 +2780,21 @@ int bar(int n){ // CHECK3-NEXT: store ptr [[C1]], ptr [[TMP6]], align 4 // CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 // CHECK3-NEXT: store ptr [[D2]], ptr [[TMP7]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6) -// CHECK3-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1 -// CHECK3-NEXT: br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK3-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer() +// CHECK3-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6) +// CHECK3-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1 +// CHECK3-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] // CHECK3: .omp.reduction.then: -// CHECK3-NEXT: [[TMP11:%.*]] = load i8, ptr [[TMP0]], align 1 -// CHECK3-NEXT: [[CONV4:%.*]] = sext i8 [[TMP11]] to i32 -// CHECK3-NEXT: [[TMP12:%.*]] = load i8, ptr [[C1]], align 1 -// CHECK3-NEXT: [[CONV5:%.*]] = sext i8 [[TMP12]] to i32 +// CHECK3-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP0]], align 1 +// CHECK3-NEXT: [[CONV4:%.*]] = sext i8 [[TMP10]] to i32 +// CHECK3-NEXT: [[TMP11:%.*]] = load i8, ptr [[C1]], align 1 +// CHECK3-NEXT: [[CONV5:%.*]] = sext i8 [[TMP11]] to i32 // CHECK3-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] // CHECK3-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 // CHECK3-NEXT: store i8 [[CONV7]], ptr [[TMP0]], align 1 -// CHECK3-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP1]], align 4 -// CHECK3-NEXT: [[TMP14:%.*]] = load float, ptr [[D2]], align 4 -// CHECK3-NEXT: [[MUL8:%.*]] = fmul float [[TMP13]], [[TMP14]] +// CHECK3-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP1]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load float, ptr [[D2]], align 4 +// CHECK3-NEXT: [[MUL8:%.*]] = fmul float [[TMP12]], [[TMP13]] // CHECK3-NEXT: store float [[MUL8]], ptr [[TMP1]], align 4 // CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) // CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]] @@ -3121,29 +3121,29 @@ int bar(int n){ // CHECK3-NEXT: store ptr [[A1]], ptr [[TMP6]], align 4 // CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 // CHECK3-NEXT: store ptr [[B2]], ptr [[TMP7]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14) -// CHECK3-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1 -// CHECK3-NEXT: br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK3-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer() +// CHECK3-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14) +// CHECK3-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1 +// CHECK3-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] // CHECK3: .omp.reduction.then: -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP0]], align 4 -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[A1]], align 4 -// CHECK3-NEXT: [[OR:%.*]] = or i32 [[TMP11]], [[TMP12]] +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP0]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[A1]], align 4 +// CHECK3-NEXT: [[OR:%.*]] = or i32 [[TMP10]], [[TMP11]] // CHECK3-NEXT: store i32 [[OR]], ptr [[TMP0]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP1]], align 2 -// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP13]] to i32 -// CHECK3-NEXT: [[TMP14:%.*]] = load i16, ptr [[B2]], align 2 -// CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK3-NEXT: [[TMP12:%.*]] = load i16, ptr [[TMP1]], align 2 +// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP12]] to i32 +// CHECK3-NEXT: [[TMP13:%.*]] = load i16, ptr [[B2]], align 2 +// CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP13]] to i32 // CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] // CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] // CHECK3: cond.true: -// CHECK3-NEXT: [[TMP15:%.*]] = load i16, ptr [[TMP1]], align 2 +// CHECK3-NEXT: [[TMP14:%.*]] = load i16, ptr [[TMP1]], align 2 // CHECK3-NEXT: br label [[COND_END:%.*]] // CHECK3: cond.false: -// CHECK3-NEXT: [[TMP16:%.*]] = load i16, ptr [[B2]], align 2 +// CHECK3-NEXT: [[TMP15:%.*]] = load i16, ptr [[B2]], align 2 // CHECK3-NEXT: br label [[COND_END]] // CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i16 [ [[TMP15]], [[COND_TRUE]] ], [ [[TMP16]], [[COND_FALSE]] ] +// CHECK3-NEXT: [[COND:%.*]] = phi i16 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] // CHECK3-NEXT: store i16 [[COND]], ptr [[TMP1]], align 2 // CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) // CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]] @@ -3551,13 +3551,13 @@ int bar(int n){ // CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0 // CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4 -// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2048 x i32], ptr [[A]], i32 0, i32 [[TMP5]] // CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4 // CHECK3-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 128 // CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1 // CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4 -// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1 // CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2048 x i16], ptr [[B]], i32 0, i32 [[TMP5]] // CHECK3-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2 // CHECK3-NEXT: store i16 [[TMP13]], ptr [[TMP12]], align 128 @@ -3577,11 +3577,11 @@ int bar(int n){ // CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0 +// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2048 x i32], ptr [[A]], i32 0, i32 [[TMP4]] // CHECK3-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4 // CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1 +// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2048 x i16], ptr [[B]], i32 0, i32 [[TMP4]] // CHECK3-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 4 // CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4 @@ -3603,13 +3603,13 @@ int bar(int n){ // CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0 // CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4 -// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2048 x i32], ptr [[A]], i32 0, i32 [[TMP5]] // CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 128 // CHECK3-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 4 // CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1 // CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4 -// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1 // CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2048 x i16], ptr [[B]], i32 0, i32 [[TMP5]] // CHECK3-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 128 // CHECK3-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 2 @@ -3629,11 +3629,11 @@ int bar(int n){ // CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4 // CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0 +// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2048 x i32], ptr [[A]], i32 0, i32 [[TMP4]] // CHECK3-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4 // CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1 +// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2048 x i16], ptr [[B]], i32 0, i32 [[TMP4]] // CHECK3-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 4 // CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4 diff --git a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp index 63926bb444081..f9aef3acb1c61 100644 --- a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp +++ b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp @@ -1328,24 +1328,24 @@ int foo() { // IR-GPU-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4 // IR-GPU-NEXT: [[TMP39:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0 // IR-GPU-NEXT: store ptr [[SUM1_ASCAST]], ptr [[TMP39]], align 8 -// IR-GPU-NEXT: [[TMP40:%.*]] = load ptr, ptr addrspace(1) @"_openmp_teams_reductions_buffer_$_$ptr", align 8 -// IR-GPU-NEXT: [[TMP41:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP38]], ptr [[TMP40]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func.1, ptr @_omp_reduction_inter_warp_copy_func.2, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func) -// IR-GPU-NEXT: [[TMP42:%.*]] = icmp eq i32 [[TMP41]], 1 -// IR-GPU-NEXT: br i1 [[TMP42]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// IR-GPU-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer() +// IR-GPU-NEXT: [[TMP40:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP38]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func.1, ptr @_omp_reduction_inter_warp_copy_func.2, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func) +// IR-GPU-NEXT: [[TMP41:%.*]] = icmp eq i32 [[TMP40]], 1 +// IR-GPU-NEXT: br i1 [[TMP41]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] // IR-GPU: .omp.reduction.then: -// IR-GPU-NEXT: [[TMP43:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100 -// IR-GPU-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP43]] +// IR-GPU-NEXT: [[TMP42:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100 +// IR-GPU-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP42]] // IR-GPU-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE17:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]] // IR-GPU: omp.arraycpy.body: // IR-GPU-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1_ASCAST]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] // IR-GPU-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST13:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY]] ] -// IR-GPU-NEXT: [[TMP44:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4 -// IR-GPU-NEXT: [[TMP45:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4 -// IR-GPU-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] +// IR-GPU-NEXT: [[TMP43:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4 +// IR-GPU-NEXT: [[TMP44:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4 +// IR-GPU-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP43]], [[TMP44]] // IR-GPU-NEXT: store i32 [[ADD14]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4 // IR-GPU-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], i32 1 // IR-GPU-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1 -// IR-GPU-NEXT: [[OMP_ARRAYCPY_DONE16:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP43]] +// IR-GPU-NEXT: [[OMP_ARRAYCPY_DONE16:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP42]] // IR-GPU-NEXT: br i1 [[OMP_ARRAYCPY_DONE16]], label [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_BODY]] // IR-GPU: omp.arraycpy.done17: // IR-GPU-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP38]]) diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h index 6ce56475c09b3..a603e91d1182d 100644 --- a/openmp/libomptarget/DeviceRTL/include/Interface.h +++ b/openmp/libomptarget/DeviceRTL/include/Interface.h @@ -234,6 +234,8 @@ void __kmpc_nvptx_end_reduce(int32_t TId); void __kmpc_nvptx_end_reduce_nowait(int32_t TId); +void *__kmpc_reduction_get_fixed_buffer(); + int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size, void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct); diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp index a041d239e1abb..efa09cafa879e 100644 --- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp @@ -167,9 +167,6 @@ uint32_t roundToWarpsize(uint32_t s) { uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; } -static uint32_t IterCnt = 0; -static uint32_t Cnt = 0; - } // namespace extern "C" { @@ -194,6 +191,9 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2( ThreadId = 0; } + uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt; + uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt; + // In non-generic mode all workers participate in the teams reduction. // In generic mode only the team master participates in the teams // reduction because the workers are waiting for parallel work. @@ -313,4 +313,8 @@ void __kmpc_nvptx_end_reduce(int32_t TId) {} void __kmpc_nvptx_end_reduce_nowait(int32_t TId) {} } +void *__kmpc_reduction_get_fixed_buffer() { + return state::getKernelLaunchEnvironment().ReductionBuffer; +} + #pragma omp end declare target diff --git a/openmp/libomptarget/include/Environment.h b/openmp/libomptarget/include/Environment.h index 9c02e2390581d..bd493e8a0be78 100644 --- a/openmp/libomptarget/include/Environment.h +++ b/openmp/libomptarget/include/Environment.h @@ -77,15 +77,16 @@ struct DynamicEnvironmentTy { // NOTE: Please don't change the order of those members as their indices are // used in the middle end. Always add the new data member at the end. struct ConfigurationEnvironmentTy { - uint8_t UseGenericStateMachine; - uint8_t MayUseNestedParallelism; - llvm::omp::OMPTgtExecModeFlags ExecMode; + uint8_t UseGenericStateMachine = 2; + uint8_t MayUseNestedParallelism = 2; + llvm::omp::OMPTgtExecModeFlags ExecMode = llvm::omp::OMP_TGT_EXEC_MODE_SPMD; // Information about (legal) launch configurations. //{ - int32_t MinThreads; - int32_t MaxThreads; - int32_t MinTeams; - int32_t MaxTeams; + int32_t MinThreads = -1; + int32_t MaxThreads = -1; + int32_t MinTeams = -1; + int32_t MaxTeams = -1; + int32_t ReductionBufferSize = 0; //} }; @@ -93,10 +94,14 @@ struct ConfigurationEnvironmentTy { // used in the middle end. Always add the new data member at the end. struct KernelEnvironmentTy { ConfigurationEnvironmentTy Configuration; - IdentTy *Ident; - DynamicEnvironmentTy *DynamicEnv; + IdentTy *Ident = nullptr; + DynamicEnvironmentTy *DynamicEnv = nullptr; }; -struct KernelLaunchEnvironmentTy {}; +struct KernelLaunchEnvironmentTy { + uint32_t ReductionCnt = 0; + uint32_t ReductionIterCnt = 0; + void *ReductionBuffer = nullptr; +}; #endif // _OMPTARGET_ENVIRONMENT_H_ diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp index 69943486aa720..8747502065a59 100644 --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -402,9 +402,8 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice, DP("Failed to read kernel environment for '%s': %s\n" "Using default SPMD (2) execution mode\n", Name, ErrStr.data()); - KernelEnvironment.Configuration.ExecMode = OMP_TGT_EXEC_MODE_SPMD; - KernelEnvironment.Configuration.MayUseNestedParallelism = /*Unknown=*/2; - KernelEnvironment.Configuration.UseGenericStateMachine = /*Unknown=*/2; + assert(KernelEnvironment.Configuration.ReductionBufferSize == 0 && + "Default initialization failed."); } // Max = Config.Max > 0 ? min(Config.Max, Device.Max) : Device.Max; @@ -441,6 +440,17 @@ GenericKernelTy::getKernelLaunchEnvironment( /// async data transfer. auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment; LocalKLE = KernelLaunchEnvironment; + if (KernelEnvironment.Configuration.ReductionBufferSize) { + auto AllocOrErr = GenericDevice.dataAlloc( + KernelEnvironment.Configuration.ReductionBufferSize, + /*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE); + if (!AllocOrErr) + return AllocOrErr.takeError(); + LocalKLE.ReductionBuffer = *AllocOrErr; + // Remember to free the memory later. + AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr); + } + auto Err = GenericDevice.dataSubmit(*AllocOrErr, &LocalKLE, sizeof(KernelLaunchEnvironmentTy), AsyncInfoWrapper); diff --git a/openmp/libomptarget/test/offloading/parallel_target_teams_reduction.cpp b/openmp/libomptarget/test/offloading/parallel_target_teams_reduction.cpp new file mode 100644 index 0000000000000..5303a9463f155 --- /dev/null +++ b/openmp/libomptarget/test/offloading/parallel_target_teams_reduction.cpp @@ -0,0 +1,36 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic +// RUN: %libomptarget-compileoptxx-run-and-check-generic + +// FIXME: This is a bug in host offload, this should run fine. +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +#define N 8 + +int main() { + std::vector avec(N); + int *a = avec.data(); +#pragma omp parallel for + for (int i = 0; i < N; i++) { + a[i] = 0; +#pragma omp target teams distribute parallel for reduction(+ : a[i]) + for (int j = 0; j < N; j++) + a[i] += 1; + } + + // CHECK: 8 + // CHECK: 8 + // CHECK: 8 + // CHECK: 8 + // CHECK: 8 + // CHECK: 8 + // CHECK: 8 + // CHECK: 8 + for (int i = 0; i < N; i++) + std::cout << a[i] << std::endl; +}