-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[profcheck] Fix missing profile metadata in ExpandMemCmp #169979
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[profcheck] Fix missing profile metadata in ExpandMemCmp #169979
Conversation
|
@llvm/pr-subscribers-llvm-transforms Author: Jin Huang (jinhuang1102) ChangesThis patch fixes a profile metadata missing in the The patch updates the The patch also includes updates to the tests to reflect the new branch weights. Full diff: https://github.com/llvm/llvm-project/pull/169979.diff 4 Files Affected:
diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp
index 74f93e1979532..61ab0cb869904 100644
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -25,6 +25,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -62,7 +63,6 @@ static cl::opt<unsigned> MaxLoadsPerMemcmpOptSize(
namespace {
-
// This class provides helper functions to expand a memcmp library call into an
// inline expansion.
class MemCmpExpansion {
@@ -92,8 +92,7 @@ class MemCmpExpansion {
// 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}.
struct LoadEntry {
LoadEntry(unsigned LoadSize, uint64_t Offset)
- : LoadSize(LoadSize), Offset(Offset) {
- }
+ : LoadSize(LoadSize), Offset(Offset) {}
// The size of the load for this block, in bytes.
unsigned LoadSize;
@@ -488,6 +487,8 @@ void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
// continue to next LoadCmpBlock or EndBlock.
BasicBlock *BB = Builder.GetInsertBlock();
BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
+ setExplicitlyUnknownBranchWeightsIfProfiled(*CmpBr, DEBUG_TYPE,
+ CI->getFunction());
Builder.Insert(CmpBr);
if (DTU)
DTU->applyUpdates({{DominatorTree::Insert, BB, ResBlock.BB},
@@ -552,6 +553,8 @@ void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
// to next LoadCmpBlock or EndBlock.
BasicBlock *BB = Builder.GetInsertBlock();
BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp);
+ setExplicitlyUnknownBranchWeightsIfProfiled(*CmpBr, DEBUG_TYPE,
+ CI->getFunction());
Builder.Insert(CmpBr);
if (DTU)
DTU->applyUpdates({{DominatorTree::Insert, BB, NextBB},
@@ -592,6 +595,8 @@ void MemCmpExpansion::emitMemCmpResultBlock() {
Value *Res =
Builder.CreateSelect(Cmp, Constant::getAllOnesValue(Builder.getInt32Ty()),
ConstantInt::get(Builder.getInt32Ty(), 1));
+ setExplicitlyUnknownBranchWeightsIfProfiled(*cast<Instruction>(Res),
+ DEBUG_TYPE, CI->getFunction());
PhiRes->addIncoming(Res, ResBlock.BB);
BranchInst *NewBr = BranchInst::Create(EndBlock);
@@ -724,7 +729,8 @@ Value *MemCmpExpansion::getMemCmpExpansion() {
// calculate which source was larger. The calculation requires the
// two loaded source values of each load compare block.
// These will be saved in the phi nodes created by setupResultBlockPHINodes.
- if (!IsUsedForZeroCmp) setupResultBlockPHINodes();
+ if (!IsUsedForZeroCmp)
+ setupResultBlockPHINodes();
// Create the number of required load compare basic blocks.
createLoadCmpBlocks();
@@ -853,15 +859,14 @@ static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
const bool IsUsedForZeroCmp =
IsBCmp || isOnlyUsedInZeroEqualityComparison(CI);
bool OptForSize = llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);
- auto Options = TTI->enableMemCmpExpansion(OptForSize,
- IsUsedForZeroCmp);
- if (!Options) return false;
+ auto Options = TTI->enableMemCmpExpansion(OptForSize, IsUsedForZeroCmp);
+ if (!Options)
+ return false;
if (MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences())
Options.NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock;
- if (OptForSize &&
- MaxLoadsPerMemcmpOptSize.getNumOccurrences())
+ if (OptForSize && MaxLoadsPerMemcmpOptSize.getNumOccurrences())
Options.MaxNumLoads = MaxLoadsPerMemcmpOptSize;
if (!OptForSize && MaxLoadsPerMemcmp.getNumOccurrences())
@@ -907,13 +912,14 @@ class ExpandMemCmpLegacyPass : public FunctionPass {
}
bool runOnFunction(Function &F) override {
- if (skipFunction(F)) return false;
+ if (skipFunction(F))
+ return false;
auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
if (!TPC) {
return false;
}
- const TargetLowering* TL =
+ const TargetLowering *TL =
TPC->getTM<TargetMachine>().getSubtargetImpl(F)->getTargetLowering();
const TargetLibraryInfo *TLI =
@@ -921,9 +927,9 @@ class ExpandMemCmpLegacyPass : public FunctionPass {
const TargetTransformInfo *TTI =
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- auto *BFI = (PSI && PSI->hasProfileSummary()) ?
- &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
- nullptr;
+ auto *BFI = (PSI && PSI->hasProfileSummary())
+ ? &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI()
+ : nullptr;
DominatorTree *DT = nullptr;
if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
DT = &DTWP->getDomTree();
@@ -969,7 +975,7 @@ PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
if (DT)
DTU.emplace(DT, DomTreeUpdater::UpdateStrategy::Lazy);
- const DataLayout& DL = F.getDataLayout();
+ const DataLayout &DL = F.getDataLayout();
bool MadeChanges = false;
for (auto BBIt = F.begin(); BBIt != F.end();) {
if (runOnBlock(*BBIt, TLI, TTI, TL, DL, PSI, BFI, DTU ? &*DTU : nullptr)) {
diff --git a/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll
index 7df48d878bd0b..0ed3af535ac9c 100644
--- a/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --version 3
; RUN: opt -S -expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=aarch64-unknown-unknown < %s | FileCheck %s
; RUN: opt -S -passes=expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=aarch64-unknown-unknown < %s | FileCheck %s
@@ -98,15 +98,15 @@ define i32 @cmp6(ptr nocapture readonly %x, ptr nocapture readonly %y) {
ret i32 %call
}
-define i32 @cmp7(ptr nocapture readonly %x, ptr nocapture readonly %y) {
+define i32 @cmp7(ptr nocapture readonly %x, ptr nocapture readonly %y) !prof !0 {
; CHECK-LABEL: define i32 @cmp7(
-; CHECK-SAME: ptr readonly captures(none) [[X:%.*]], ptr readonly captures(none) [[Y:%.*]]) {
+; CHECK-SAME: ptr readonly captures(none) [[X:%.*]], ptr readonly captures(none) [[Y:%.*]]) !prof [[PROF0:![0-9]+]] {
; CHECK-NEXT: br label [[LOADBB:%.*]]
; CHECK: res_block:
; CHECK-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
; CHECK-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1, !prof [[PROF1:![0-9]+]]
; CHECK-NEXT: br label [[ENDBLOCK:%.*]]
; CHECK: loadbb:
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[X]], align 1
@@ -114,7 +114,7 @@ define i32 @cmp7(ptr nocapture readonly %x, ptr nocapture readonly %y) {
; CHECK-NEXT: [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
; CHECK-NEXT: [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]], !prof [[PROF1]]
; CHECK: loadbb1:
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
@@ -123,7 +123,7 @@ define i32 @cmp7(ptr nocapture readonly %x, ptr nocapture readonly %y) {
; CHECK-NEXT: [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
; CHECK-NEXT: [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
-; CHECK-NEXT: br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]], !prof [[PROF1]]
; CHECK: endblock:
; CHECK-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
; CHECK-NEXT: ret i32 [[PHI_RES]]
@@ -860,3 +860,11 @@ define i32 @cmp_eq16(ptr nocapture readonly %x, ptr nocapture readonly %y) {
%conv = zext i1 %cmp to i32
ret i32 %conv
}
+
+!0 = !{!"function_entry_count", i64 1000}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; CHECK: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF1]] = !{!"unknown", !"expand-memcmp"}
+;.
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
index 0507ec9de542e..30da8b391f55d 100644
--- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
; RUN: opt -S -expand-memcmp -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=X32
; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=X32
@@ -34,12 +34,12 @@ define i32 @cmp2_align2(ptr nocapture readonly align 2 %x, ptr nocapture readonl
ret i32 %call
}
-define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y) {
+define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y) !prof !0 {
; X32-LABEL: @cmp3(
; X32-NEXT: br label [[LOADBB:%.*]]
; X32: res_block:
; X32-NEXT: [[TMP1:%.*]] = icmp ult i16 [[TMP7:%.*]], [[TMP8:%.*]]
-; X32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1, !prof [[PROF1:![0-9]+]]
; X32-NEXT: br label [[ENDBLOCK:%.*]]
; X32: loadbb:
; X32-NEXT: [[TMP5:%.*]] = load i16, ptr [[X:%.*]], align 1
@@ -47,7 +47,7 @@ define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y) {
; X32-NEXT: [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]])
; X32-NEXT: [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]])
; X32-NEXT: [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]]
-; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]], !prof [[PROF1]]
; X32: loadbb1:
; X32-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 2
; X32-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 2
@@ -564,3 +564,10 @@ define i32 @cmp_eq16(ptr nocapture readonly %x, ptr nocapture readonly %y) {
ret i32 %conv
}
+!0 = !{!"function_entry_count", i64 1000}
+;.
+; X32: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; X32: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+; X32: [[PROF1]] = !{!"unknown", !"expand-memcmp"}
+;.
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
index 86dc3e5245f24..b5075a2fcff0b 100644
--- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
; RUN: opt -S -expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64_1LD
; RUN: opt -S -expand-memcmp -memcmp-num-loads-per-block=2 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64_2LD
; RUN: opt -S -passes=expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64_1LD
@@ -36,12 +36,12 @@ define i32 @cmp2_align2(ptr nocapture readonly align 2 %x, ptr nocapture readonl
ret i32 %call
}
-define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y) {
+define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y) !prof !0 {
; X64-LABEL: @cmp3(
; X64-NEXT: br label [[LOADBB:%.*]]
; X64: res_block:
; X64-NEXT: [[TMP1:%.*]] = icmp ult i16 [[TMP7:%.*]], [[TMP8:%.*]]
-; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1, !prof [[PROF1:![0-9]+]]
; X64-NEXT: br label [[ENDBLOCK:%.*]]
; X64: loadbb:
; X64-NEXT: [[TMP5:%.*]] = load i16, ptr [[X:%.*]], align 1
@@ -49,7 +49,7 @@ define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y) {
; X64-NEXT: [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]])
; X64-NEXT: [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]])
; X64-NEXT: [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]]
-; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]], !prof [[PROF1]]
; X64: loadbb1:
; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 2
; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 2
@@ -474,7 +474,7 @@ define i32 @cmp_eq2(ptr nocapture readonly %x, ptr nocapture readonly %y) {
ret i32 %conv
}
-define i32 @cmp_eq3(ptr nocapture readonly %x, ptr nocapture readonly %y) {
+define i32 @cmp_eq3(ptr nocapture readonly %x, ptr nocapture readonly %y) !prof !0 {
; X64_1LD-LABEL: @cmp_eq3(
; X64_1LD-NEXT: br label [[LOADBB:%.*]]
; X64_1LD: res_block:
@@ -483,14 +483,14 @@ define i32 @cmp_eq3(ptr nocapture readonly %x, ptr nocapture readonly %y) {
; X64_1LD-NEXT: [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
; X64_1LD-NEXT: [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]], !prof [[PROF1]]
; X64_1LD: loadbb1:
; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 2
; X64_1LD-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 2
; X64_1LD-NEXT: [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
; X64_1LD-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
; X64_1LD-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]]
-; X64_1LD-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]], !prof [[PROF1]]
; X64_1LD: endblock:
; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -1076,3 +1076,15 @@ define i32 @cmp_eq16(ptr nocapture readonly %x, ptr nocapture readonly %y) {
ret i32 %conv
}
+!0 = !{!"function_entry_count", i64 1000}
+;.
+; X64_1LD: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; X64_2LD: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; X64_1LD: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+; X64_1LD: [[PROF1]] = !{!"unknown", !"expand-memcmp"}
+;.
+; X64_2LD: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+; X64_2LD: [[PROF1]] = !{!"unknown", !"expand-memcmp"}
+;.
|
This patch fixes a profile metadata missing in the
ExpandMemCmppass when it expandingmemcmpcalls. This would cause branches between different blocks to lose their profile data, potentially leading to suboptimal code generation.The patch updates the
ExpandMemCmppass to set branch weights to a defaultunknown(50/50 weights) value when a profile is available. This prevents the expansion from making a previously profiled branch unprofiled.The patch also includes updates to the tests to reflect the new branch weights.