Skip to content

Commit 24b87b8

Browse files
committed
[VPlan] Skip cost verification for loops with EVL gather/scatter.
The VPlan-based cost model uses vp_gather/vp_scatter for gather/scatter costs, which is different from the legacy cost model and cannot be matched there. Don't verify that the costs match for plans containing gathers/scatters with EVL. Fixes #169948.
1 parent 9ffd2e4 commit 24b87b8

File tree

2 files changed

+139
-11
lines changed

2 files changed

+139
-11
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7187,17 +7187,29 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
71877187
VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
71887188
*CM.PSE.getSE(), OrigLoop);
71897189
precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
7190-
// Verify that the VPlan-based and legacy cost models agree, except for VPlans
7191-
// with early exits and plans with additional VPlan simplifications. The
7192-
// legacy cost model doesn't properly model costs for such loops.
7193-
assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
7194-
!Legal->getLAI()->getSymbolicStrides().empty() ||
7195-
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
7196-
CostCtx, OrigLoop,
7197-
BestFactor.Width) ||
7198-
planContainsAdditionalSimplifications(
7199-
getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
7200-
" VPlan cost model and legacy cost model disagreed");
7190+
// Verify that the VPlan-based and legacy cost models agree, except for
7191+
// * VPlans with early exits,
7192+
// * VPlans with additional VPlan simplifications,
7193+
// * EVL-based VPlans with gather/scatters (the VPlan-based cost model uses
7194+
// vp_scatter/vp_gather).
7195+
// The legacy cost model doesn't properly model costs for such loops.
7196+
bool UsesEVLGatherScatter =
7197+
any_of(VPBlockUtils::blocksOnly<VPBasicBlock>(vp_depth_first_shallow(
7198+
BestPlan.getVectorLoopRegion()->getEntry())),
7199+
[](VPBasicBlock *VPBB) {
7200+
return any_of(*VPBB, [](VPRecipeBase &R) {
7201+
return isa<VPWidenLoadEVLRecipe, VPWidenStoreEVLRecipe>(&R) &&
7202+
!cast<VPWidenMemoryRecipe>(&R)->isConsecutive();
7203+
});
7204+
});
7205+
assert(
7206+
(BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
7207+
!Legal->getLAI()->getSymbolicStrides().empty() || UsesEVLGatherScatter ||
7208+
planContainsAdditionalSimplifications(
7209+
getPlanFor(BestFactor.Width), CostCtx, OrigLoop, BestFactor.Width) ||
7210+
planContainsAdditionalSimplifications(
7211+
getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
7212+
" VPlan cost model and legacy cost model disagreed");
72017213
assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
72027214
"when vectorizing, the scalar cost must be computed.");
72037215
#endif

llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,3 +219,119 @@ loop:
219219
exit:
220220
ret void
221221
}
222+
223+
; Test for https://github.com/llvm/llvm-project/issues/169948.
224+
define i8 @mixed_gather_scatters(ptr %A, ptr %B, ptr %C) #0 {
225+
; RVA23-LABEL: @mixed_gather_scatters(
226+
; RVA23-NEXT: entry:
227+
; RVA23-NEXT: br label [[VECTOR_PH:%.*]]
228+
; RVA23: vector.ph:
229+
; RVA23-NEXT: br label [[VECTOR_BODY:%.*]]
230+
; RVA23: vector.body:
231+
; RVA23-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
232+
; RVA23-NEXT: [[AVL:%.*]] = phi i32 [ 10, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
233+
; RVA23-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 2, i1 true)
234+
; RVA23-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A:%.*]], align 8
235+
; RVA23-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[TMP1]], i64 0
236+
; RVA23-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
237+
; RVA23-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP0]])
238+
; RVA23-NEXT: [[TMP2:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_MASKED_GATHER]], zeroinitializer
239+
; RVA23-NEXT: [[TMP3:%.*]] = zext <vscale x 2 x i1> [[TMP2]] to <vscale x 2 x i8>
240+
; RVA23-NEXT: [[TMP4:%.*]] = or <vscale x 2 x i8> [[VEC_PHI]], [[TMP3]]
241+
; RVA23-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B:%.*]], align 8
242+
; RVA23-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[TMP5]], i64 0
243+
; RVA23-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
244+
; RVA23-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT2]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP0]])
245+
; RVA23-NEXT: [[TMP6:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_MASKED_GATHER3]], zeroinitializer
246+
; RVA23-NEXT: [[TMP7:%.*]] = zext <vscale x 2 x i1> [[TMP6]] to <vscale x 2 x i8>
247+
; RVA23-NEXT: [[TMP8:%.*]] = or <vscale x 2 x i8> [[TMP4]], [[TMP7]]
248+
; RVA23-NEXT: [[TMP9:%.*]] = or <vscale x 2 x i8> [[TMP8]], splat (i8 1)
249+
; RVA23-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C:%.*]], align 8
250+
; RVA23-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[TMP10]], i64 0
251+
; RVA23-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT4]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
252+
; RVA23-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT5]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP0]])
253+
; RVA23-NEXT: [[TMP11:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_MASKED_GATHER6]], zeroinitializer
254+
; RVA23-NEXT: [[TMP12:%.*]] = zext <vscale x 2 x i1> [[TMP11]] to <vscale x 2 x i8>
255+
; RVA23-NEXT: [[TMP13:%.*]] = or <vscale x 2 x i8> [[TMP9]], [[TMP12]]
256+
; RVA23-NEXT: [[TMP14]] = call <vscale x 2 x i8> @llvm.vp.merge.nxv2i8(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> [[TMP13]], <vscale x 2 x i8> [[VEC_PHI]], i32 [[TMP0]])
257+
; RVA23-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP0]]
258+
; RVA23-NEXT: [[TMP15:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
259+
; RVA23-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
260+
; RVA23: middle.block:
261+
; RVA23-NEXT: [[TMP16:%.*]] = call i8 @llvm.vector.reduce.or.nxv2i8(<vscale x 2 x i8> [[TMP14]])
262+
; RVA23-NEXT: br label [[EXIT:%.*]]
263+
; RVA23: exit:
264+
; RVA23-NEXT: ret i8 [[TMP16]]
265+
;
266+
; RVA23ZVL1024B-LABEL: @mixed_gather_scatters(
267+
; RVA23ZVL1024B-NEXT: entry:
268+
; RVA23ZVL1024B-NEXT: br label [[VECTOR_PH:%.*]]
269+
; RVA23ZVL1024B: vector.ph:
270+
; RVA23ZVL1024B-NEXT: br label [[VECTOR_BODY:%.*]]
271+
; RVA23ZVL1024B: vector.body:
272+
; RVA23ZVL1024B-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
273+
; RVA23ZVL1024B-NEXT: [[AVL:%.*]] = phi i32 [ 10, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
274+
; RVA23ZVL1024B-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 1, i1 true)
275+
; RVA23ZVL1024B-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A:%.*]], align 8
276+
; RVA23ZVL1024B-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[TMP1]], i64 0
277+
; RVA23ZVL1024B-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
278+
; RVA23ZVL1024B-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 1 x i64> @llvm.vp.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> align 8 [[BROADCAST_SPLAT]], <vscale x 1 x i1> splat (i1 true), i32 [[TMP0]])
279+
; RVA23ZVL1024B-NEXT: [[TMP2:%.*]] = icmp sgt <vscale x 1 x i64> [[WIDE_MASKED_GATHER]], zeroinitializer
280+
; RVA23ZVL1024B-NEXT: [[TMP3:%.*]] = zext <vscale x 1 x i1> [[TMP2]] to <vscale x 1 x i8>
281+
; RVA23ZVL1024B-NEXT: [[TMP4:%.*]] = or <vscale x 1 x i8> [[VEC_PHI]], [[TMP3]]
282+
; RVA23ZVL1024B-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B:%.*]], align 8
283+
; RVA23ZVL1024B-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[TMP5]], i64 0
284+
; RVA23ZVL1024B-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT1]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
285+
; RVA23ZVL1024B-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 1 x i64> @llvm.vp.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> align 8 [[BROADCAST_SPLAT2]], <vscale x 1 x i1> splat (i1 true), i32 [[TMP0]])
286+
; RVA23ZVL1024B-NEXT: [[TMP6:%.*]] = icmp sgt <vscale x 1 x i64> [[WIDE_MASKED_GATHER3]], zeroinitializer
287+
; RVA23ZVL1024B-NEXT: [[TMP7:%.*]] = zext <vscale x 1 x i1> [[TMP6]] to <vscale x 1 x i8>
288+
; RVA23ZVL1024B-NEXT: [[TMP8:%.*]] = or <vscale x 1 x i8> [[TMP4]], [[TMP7]]
289+
; RVA23ZVL1024B-NEXT: [[TMP9:%.*]] = or <vscale x 1 x i8> [[TMP8]], splat (i8 1)
290+
; RVA23ZVL1024B-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C:%.*]], align 8
291+
; RVA23ZVL1024B-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[TMP10]], i64 0
292+
; RVA23ZVL1024B-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT4]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
293+
; RVA23ZVL1024B-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 1 x i64> @llvm.vp.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> align 8 [[BROADCAST_SPLAT5]], <vscale x 1 x i1> splat (i1 true), i32 [[TMP0]])
294+
; RVA23ZVL1024B-NEXT: [[TMP11:%.*]] = icmp sgt <vscale x 1 x i64> [[WIDE_MASKED_GATHER6]], zeroinitializer
295+
; RVA23ZVL1024B-NEXT: [[TMP12:%.*]] = zext <vscale x 1 x i1> [[TMP11]] to <vscale x 1 x i8>
296+
; RVA23ZVL1024B-NEXT: [[TMP13:%.*]] = or <vscale x 1 x i8> [[TMP9]], [[TMP12]]
297+
; RVA23ZVL1024B-NEXT: [[TMP14]] = call <vscale x 1 x i8> @llvm.vp.merge.nxv1i8(<vscale x 1 x i1> splat (i1 true), <vscale x 1 x i8> [[TMP13]], <vscale x 1 x i8> [[VEC_PHI]], i32 [[TMP0]])
298+
; RVA23ZVL1024B-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP0]]
299+
; RVA23ZVL1024B-NEXT: [[TMP15:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
300+
; RVA23ZVL1024B-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
301+
; RVA23ZVL1024B: middle.block:
302+
; RVA23ZVL1024B-NEXT: [[TMP16:%.*]] = call i8 @llvm.vector.reduce.or.nxv1i8(<vscale x 1 x i8> [[TMP14]])
303+
; RVA23ZVL1024B-NEXT: br label [[EXIT:%.*]]
304+
; RVA23ZVL1024B: exit:
305+
; RVA23ZVL1024B-NEXT: ret i8 [[TMP16]]
306+
;
307+
entry:
308+
br label %loop
309+
310+
loop:
311+
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
312+
%accum = phi i8 [ 0, %entry ], [ %or.4, %loop ]
313+
%ptr.0 = load ptr, ptr %A, align 8
314+
%val.0 = load i64, ptr %ptr.0, align 8
315+
%cmp.0 = icmp sgt i64 %val.0, 0
316+
%ext.0 = zext i1 %cmp.0 to i8
317+
%or.0 = or i8 %accum, %ext.0
318+
%ptr.1 = load ptr, ptr %B, align 8
319+
%val.1 = load i64, ptr %ptr.1, align 8
320+
%cmp.1 = icmp sgt i64 %val.1, 0
321+
%ext.1 = zext i1 %cmp.1 to i8
322+
%or.1 = or i8 %or.0, %ext.1
323+
%or.2 = or i8 %or.1, 1
324+
%ptr.4 = load ptr, ptr %C, align 8
325+
%val.4 = load i64, ptr %ptr.4, align 8
326+
%cmp.4 = icmp sgt i64 %val.4, 0
327+
%ext.4 = zext i1 %cmp.4 to i8
328+
%or.4 = or i8 %or.2, %ext.4
329+
%iv.next = add i32 %iv, 1
330+
%exitcond = icmp eq i32 %iv, 9
331+
br i1 %exitcond, label %exit, label %loop
332+
333+
exit:
334+
ret i8 %or.4
335+
}
336+
337+
attributes #0 = { "target-features"="+zve64x,+zvl256b" }

0 commit comments

Comments (0)