diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp index 7b927e6ec9b81..bfc49dd354aa6 100644 --- a/llvm/lib/CodeGen/SelectOptimize.cpp +++ b/llvm/lib/CodeGen/SelectOptimize.cpp @@ -1044,6 +1044,18 @@ bool SelectOptimizeImpl::isConvertToBranchProfitableBase( return true; } + // If the latch of a non-innermost loop has a select group with three or more + // elements, it is usually profitable to convert it to branches. For innermost + // loops, `optimizeSelectsInnerLoops` decides if conversion is profitable. + auto *BB = SI.getI()->getParent(); + auto *L = LI->getLoopFor(BB); + if (L && !L->isInnermost() && L->getLoopLatch() == BB && + ASI.Selects.size() >= 3) { + OR << "Converted to branch because select group in the latch block is big."; + EmitAndPrintRemark(ORE, OR); + return true; + } + ORmiss << "Not profitable to convert to branch (base heuristic)."; EmitAndPrintRemark(ORE, ORmiss); return false; diff --git a/llvm/test/CodeGen/AArch64/selectopt.ll b/llvm/test/CodeGen/AArch64/selectopt.ll index 54309dca3b834..d72a956e08d0c 100644 --- a/llvm/test/CodeGen/AArch64/selectopt.ll +++ b/llvm/test/CodeGen/AArch64/selectopt.ll @@ -875,3 +875,124 @@ if.end: %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count br i1 %exitcond.not, label %for.cond.cleanup, label %for.body } + +declare i64 @payload(i64, ptr, ptr, i64) + +define void @outer_latch_heuristic(ptr %dst, ptr %src, i64 %p, i64 %dim) { +; CHECKOO-LABEL: @outer_latch_heuristic( +; CHECKOO-NEXT: entry: +; CHECKOO-NEXT: br label [[OUTER_LOOP:%.*]] +; CHECKOO: outer.loop: +; CHECKOO-NEXT: [[K_020_US:%.*]] = phi i64 [ [[INC7_US:%.*]], [[SELECT_END:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECKOO-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[SELECT_END]] ], [ 0, [[ENTRY]] ] +; CHECKOO-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[SELECT_END]] ], [ 0, [[ENTRY]] ] +; CHECKOO-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC:%.*]], i64 [[I]] +; CHECKOO-NEXT: [[TMP0:%.*]] = load ptr, ptr 
[[ARRAYIDX_US]], align 8 +; CHECKOO-NEXT: [[ARRAYIDX1_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[J]] +; CHECKOO-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX1_US]], align 8 +; CHECKOO-NEXT: br label [[INNER_LOOP:%.*]] +; CHECKOO: inner.loop: +; CHECKOO-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[DIM:%.*]], [[OUTER_LOOP]] ], [ [[LSR_IV_NEXT:%.*]], [[INNER_LOOP]] ] +; CHECKOO-NEXT: [[DIFF_04_I_US:%.*]] = phi i64 [ [[CALL_I_US:%.*]], [[INNER_LOOP]] ], [ 0, [[OUTER_LOOP]] ] +; CHECKOO-NEXT: [[CALL_I_US]] = tail call i64 @payload(i64 [[DIFF_04_I_US]], ptr [[TMP0]], ptr [[TMP1]], i64 [[P:%.*]]) +; CHECKOO-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1 +; CHECKOO-NEXT: [[EXITCOND_NOT_I_US:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0 +; CHECKOO-NEXT: br i1 [[EXITCOND_NOT_I_US]], label [[LATCH:%.*]], label [[INNER_LOOP]] +; CHECKOO: latch: +; CHECKOO-NEXT: [[CMP2_US:%.*]] = icmp sgt i64 [[CALL_I_US]], -1 +; CHECKOO-NEXT: [[DIFF_0_LCSSA_I_LOBIT_US:%.*]] = lshr i64 [[CALL_I_US]], 63 +; CHECKOO-NEXT: [[CMP2_US_FROZEN:%.*]] = freeze i1 [[CMP2_US]] +; CHECKOO-NEXT: br i1 [[CMP2_US_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_FALSE_SINK:%.*]] +; CHECKOO: select.true.sink: +; CHECKOO-NEXT: [[TMP2:%.*]] = add nsw i64 [[J]], 1 +; CHECKOO-NEXT: br label [[SELECT_END]] +; CHECKOO: select.false.sink: +; CHECKOO-NEXT: [[TMP3:%.*]] = add nsw i64 1, [[I]] +; CHECKOO-NEXT: br label [[SELECT_END]] +; CHECKOO: select.end: +; CHECKOO-NEXT: [[I_NEXT]] = phi i64 [ [[I]], [[SELECT_TRUE_SINK]] ], [ [[TMP3]], [[SELECT_FALSE_SINK]] ] +; CHECKOO-NEXT: [[J_NEXT]] = phi i64 [ [[TMP2]], [[SELECT_TRUE_SINK]] ], [ [[J]], [[SELECT_FALSE_SINK]] ] +; CHECKOO-NEXT: [[COND_IN_US:%.*]] = phi ptr [ [[ARRAYIDX1_US]], [[SELECT_TRUE_SINK]] ], [ [[ARRAYIDX_US]], [[SELECT_FALSE_SINK]] ] +; CHECKOO-NEXT: [[INC4_US:%.*]] = zext i1 [[CMP2_US]] to i64 +; CHECKOO-NEXT: [[COND_US:%.*]] = load ptr, ptr [[COND_IN_US]], align 8 +; CHECKOO-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[DST:%.*]], 
i64 [[K_020_US]] +; CHECKOO-NEXT: store ptr [[COND_US]], ptr [[ARRAYIDX6_US]], align 8 +; CHECKOO-NEXT: [[INC7_US]] = add i64 [[K_020_US]], 1 +; CHECKOO-NEXT: [[EXITCOND23_NOT:%.*]] = icmp eq i64 [[K_020_US]], 1000 +; CHECKOO-NEXT: br i1 [[EXITCOND23_NOT]], label [[EXIT:%.*]], label [[OUTER_LOOP]] +; CHECKOO: exit: +; CHECKOO-NEXT: ret void +; +; CHECKII-LABEL: @outer_latch_heuristic( +; CHECKII-NEXT: entry: +; CHECKII-NEXT: br label [[OUTER_LOOP:%.*]] +; CHECKII: outer.loop: +; CHECKII-NEXT: [[K_020_US:%.*]] = phi i64 [ [[INC7_US:%.*]], [[LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECKII-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[LATCH]] ], [ 0, [[ENTRY]] ] +; CHECKII-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH]] ], [ 0, [[ENTRY]] ] +; CHECKII-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC:%.*]], i64 [[I]] +; CHECKII-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8 +; CHECKII-NEXT: [[ARRAYIDX1_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[J]] +; CHECKII-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX1_US]], align 8 +; CHECKII-NEXT: br label [[INNER_LOOP:%.*]] +; CHECKII: inner.loop: +; CHECKII-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[DIM:%.*]], [[OUTER_LOOP]] ], [ [[LSR_IV_NEXT:%.*]], [[INNER_LOOP]] ] +; CHECKII-NEXT: [[DIFF_04_I_US:%.*]] = phi i64 [ [[CALL_I_US:%.*]], [[INNER_LOOP]] ], [ 0, [[OUTER_LOOP]] ] +; CHECKII-NEXT: [[CALL_I_US]] = tail call i64 @payload(i64 [[DIFF_04_I_US]], ptr [[TMP0]], ptr [[TMP1]], i64 [[P:%.*]]) +; CHECKII-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1 +; CHECKII-NEXT: [[EXITCOND_NOT_I_US:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0 +; CHECKII-NEXT: br i1 [[EXITCOND_NOT_I_US]], label [[LATCH]], label [[INNER_LOOP]] +; CHECKII: latch: +; CHECKII-NEXT: [[CMP2_US:%.*]] = icmp sgt i64 [[CALL_I_US]], -1 +; CHECKII-NEXT: [[DIFF_0_LCSSA_I_LOBIT_US:%.*]] = lshr i64 [[CALL_I_US]], 63 +; CHECKII-NEXT: [[I_NEXT]] = add nsw i64 [[DIFF_0_LCSSA_I_LOBIT_US]], [[I]] +; CHECKII-NEXT: [[INC4_US:%.*]] = zext i1 
[[CMP2_US]] to i64 +; CHECKII-NEXT: [[J_NEXT]] = add nsw i64 [[J]], [[INC4_US]] +; CHECKII-NEXT: [[COND_IN_US:%.*]] = select i1 [[CMP2_US]], ptr [[ARRAYIDX1_US]], ptr [[ARRAYIDX_US]] +; CHECKII-NEXT: [[COND_US:%.*]] = load ptr, ptr [[COND_IN_US]], align 8 +; CHECKII-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[DST:%.*]], i64 [[K_020_US]] +; CHECKII-NEXT: store ptr [[COND_US]], ptr [[ARRAYIDX6_US]], align 8 +; CHECKII-NEXT: [[INC7_US]] = add i64 [[K_020_US]], 1 +; CHECKII-NEXT: [[EXITCOND23_NOT:%.*]] = icmp eq i64 [[K_020_US]], 1000 +; CHECKII-NEXT: br i1 [[EXITCOND23_NOT]], label [[EXIT:%.*]], label [[OUTER_LOOP]] +; CHECKII: exit: +; CHECKII-NEXT: ret void +; +entry: + br label %outer.loop + +outer.loop: + %k.020.us = phi i64 [ %inc7.us, %latch ], [ 0, %entry ] + %j = phi i64 [ %j.next, %latch ], [ 0, %entry ] + %i = phi i64 [ %i.next, %latch ], [ 0, %entry ] + %arrayidx.us = getelementptr inbounds ptr, ptr %src, i64 %i + %4 = load ptr, ptr %arrayidx.us, align 8 + %arrayidx1.us = getelementptr inbounds ptr, ptr %src, i64 %j + %5 = load ptr, ptr %arrayidx1.us, align 8 + br label %inner.loop + +inner.loop: + %lsr.iv = phi i64 [ %dim, %outer.loop ], [ %lsr.iv.next, %inner.loop ] + %diff.04.i.us = phi i64 [ %call.i.us, %inner.loop ], [ 0, %outer.loop ] + %call.i.us = tail call i64 @payload(i64 %diff.04.i.us, ptr %4, ptr %5, i64 %p) + %lsr.iv.next = add i64 %lsr.iv, -1 + %exitcond.not.i.us = icmp eq i64 %lsr.iv.next, 0 + br i1 %exitcond.not.i.us, label %latch, label %inner.loop + +latch: + %cmp2.us = icmp sgt i64 %call.i.us, -1 + %diff.0.lcssa.i.lobit.us = lshr i64 %call.i.us, 63 + %i.next = add nsw i64 %diff.0.lcssa.i.lobit.us, %i + %inc4.us = zext i1 %cmp2.us to i64 + %j.next = add nsw i64 %j, %inc4.us + %cond.in.us = select i1 %cmp2.us, ptr %arrayidx1.us, ptr %arrayidx.us + %cond.us = load ptr, ptr %cond.in.us, align 8 + %arrayidx6.us = getelementptr inbounds ptr, ptr %dst, i64 %k.020.us + store ptr %cond.us, ptr %arrayidx6.us, align 8 + %inc7.us = 
add i64 %k.020.us, 1 + %exitcond23.not = icmp eq i64 %k.020.us, 1000 + br i1 %exitcond23.not, label %exit, label %outer.loop + +exit: + ret void +}