Skip to content

Commit 434f865

Browse files
committed
LoopVectorize: optimize VF for low TC, when tail-folding
9a087a3 (LoopVectorize: MaxVF should not be larger than the loop trip count) was the first commit to add the isPowerOf2_32() condition on the trip count to what is now getMaximizedVFForTarget(). It made sense at the time, as there was no tail-folding support. Much later, 2025e09 ([LV] Make sure VF doesn't exceed compile time known TC) came along to patch this with an extra condition on FoldTailByMasking, in order to ensure that the VF doesn't exceed the trip count. However, it didn't go far enough, and we can do better, as there is existing code to clamp the trip count and do tail-folding. Fixes #82626.
1 parent c0e6dd1 commit 434f865

File tree

3 files changed

+98
-73
lines changed

3 files changed

+98
-73
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 11 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4720,26 +4720,21 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
47204720
WidestRegisterMinEC *= Min;
47214721
}
47224722

4723-
// When a scalar epilogue is required, at least one iteration of the scalar
4724-
// loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4725-
// max VF that results in a dead vector loop.
4726-
if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4727-
MaxTripCount -= 1;
4728-
4729-
if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4730-
(!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4731-
// If upper bound loop trip count (TC) is known at compile time there is no
4732-
// point in choosing VF greater than TC (as done in the loop below). Select
4733-
// maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4734-
// scalable, we only fall back on a fixed VF when the TC is less than or
4735-
// equal to the known number of lanes.
4723+
if (MaxTripCount > 0 && MaxTripCount <= WidestRegisterMinEC &&
4724+
requiresScalarEpilogue(true)) {
4725+
// When a scalar epilogue is required, at least one iteration of the scalar
4726+
// loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4727+
// max VF that results in a dead vector loop.
4728+
--MaxTripCount;
4729+
4730+
// When a scalar epilogue is required, if upper bound loop trip count (TC)
4731+
// is known at compile time, clamp the VF to a maximum power of two which
4732+
// doesn't exceed TC, and vectorize using a fixed-length vector.
47364733
auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
47374734
LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
47384735
"exceeding the constant trip count: "
47394736
<< ClampedUpperTripCount << "\n");
4740-
return ElementCount::get(
4741-
ClampedUpperTripCount,
4742-
FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4737+
return ElementCount::getFixed(ClampedUpperTripCount);
47434738
}
47444739

47454740
TargetTransformInfo::RegisterKind RegKind =

llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll

Lines changed: 86 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -258,36 +258,46 @@ for.end: ; preds = %for.body
258258
define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
259259
; CHECK-LABEL: @trip16_i8(
260260
; CHECK-NEXT: entry:
261-
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
261+
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
262+
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
263+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
264+
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
262265
; CHECK: vector.ph:
266+
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
267+
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
268+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
269+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
270+
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
271+
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
263272
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
264273
; CHECK: vector.body:
265274
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
266-
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
267-
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
268-
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
269-
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
270-
; CHECK-NEXT: [[TMP3:%.*]] = shl <16 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
271-
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
272-
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
273-
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
274-
; CHECK-NEXT: [[TMP6:%.*]] = add <16 x i8> [[TMP3]], [[WIDE_LOAD1]]
275-
; CHECK-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP5]], align 1
276-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
275+
; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
276+
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
277+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
278+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
279+
; CHECK-NEXT: [[TMP9:%.*]] = shl <vscale x 8 x i8> [[WIDE_LOAD]], shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 1, i64 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
280+
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP6]]
281+
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
282+
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
283+
; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 8 x i8> [[TMP9]], [[WIDE_LOAD1]]
284+
; CHECK-NEXT: store <vscale x 8 x i8> [[TMP12]], ptr [[TMP11]], align 1
285+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
277286
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
278287
; CHECK: middle.block:
279-
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
288+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
289+
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
280290
; CHECK: scalar.ph:
281-
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
291+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
282292
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
283293
; CHECK: for.body:
284294
; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
285295
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
286-
; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
287-
; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP7]], 1
296+
; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
297+
; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP13]], 1
288298
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
289-
; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
290-
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP8]]
299+
; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
300+
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP14]]
291301
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
292302
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
293303
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16
@@ -319,36 +329,46 @@ for.end: ; preds = %for.body
319329
define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
320330
; CHECK-LABEL: @trip32_i8(
321331
; CHECK-NEXT: entry:
322-
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
332+
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
333+
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
334+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 32, [[TMP1]]
335+
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
323336
; CHECK: vector.ph:
337+
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
338+
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
339+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 32, [[TMP3]]
340+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 32, [[N_MOD_VF]]
341+
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
342+
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
324343
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
325344
; CHECK: vector.body:
326345
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
327-
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
328-
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
329-
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
330-
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP2]], align 1
331-
; CHECK-NEXT: [[TMP3:%.*]] = shl <32 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
332-
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
333-
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
334-
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP5]], align 1
335-
; CHECK-NEXT: [[TMP6:%.*]] = add <32 x i8> [[TMP3]], [[WIDE_LOAD1]]
336-
; CHECK-NEXT: store <32 x i8> [[TMP6]], ptr [[TMP5]], align 1
337-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
346+
; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
347+
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
348+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
349+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
350+
; CHECK-NEXT: [[TMP9:%.*]] = shl <vscale x 16 x i8> [[WIDE_LOAD]], shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
351+
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP6]]
352+
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
353+
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
354+
; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 16 x i8> [[TMP9]], [[WIDE_LOAD1]]
355+
; CHECK-NEXT: store <vscale x 16 x i8> [[TMP12]], ptr [[TMP11]], align 1
356+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
338357
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
339358
; CHECK: middle.block:
340-
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
359+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 32, [[N_VEC]]
360+
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
341361
; CHECK: scalar.ph:
342-
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
362+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
343363
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
344364
; CHECK: for.body:
345365
; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
346366
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
347-
; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
348-
; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP7]], 1
367+
; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
368+
; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP13]], 1
349369
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
350-
; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
351-
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP8]]
370+
; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
371+
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP14]]
352372
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
353373
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
354374
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 32
@@ -379,37 +399,47 @@ for.end: ; preds = %for.body
379399
define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
380400
; CHECK-LABEL: @trip24_i8(
381401
; CHECK-NEXT: entry:
382-
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
402+
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
403+
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
404+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 24, [[TMP1]]
405+
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
383406
; CHECK: vector.ph:
407+
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
408+
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
409+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 24, [[TMP3]]
410+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 24, [[N_MOD_VF]]
411+
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
412+
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
384413
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
385414
; CHECK: vector.body:
386415
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
387-
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
388-
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
389-
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
390-
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
391-
; CHECK-NEXT: [[TMP3:%.*]] = shl <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
392-
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
393-
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
394-
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
395-
; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i8> [[TMP3]], [[WIDE_LOAD1]]
396-
; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1
397-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
398-
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
399-
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
416+
; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
417+
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
418+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
419+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
420+
; CHECK-NEXT: [[TMP9:%.*]] = shl <vscale x 4 x i8> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i8> insertelement (<vscale x 4 x i8> poison, i8 1, i64 0), <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer)
421+
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP6]]
422+
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
423+
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
424+
; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i8> [[TMP9]], [[WIDE_LOAD1]]
425+
; CHECK-NEXT: store <vscale x 4 x i8> [[TMP12]], ptr [[TMP11]], align 1
426+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
427+
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
428+
; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
400429
; CHECK: middle.block:
401-
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
430+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 24, [[N_VEC]]
431+
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
402432
; CHECK: scalar.ph:
403-
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
433+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
404434
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
405435
; CHECK: for.body:
406436
; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
407437
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
408-
; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
409-
; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP8]], 1
438+
; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
439+
; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP14]], 1
410440
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
411-
; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
412-
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP9]]
441+
; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
442+
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]]
413443
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
414444
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
415445
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 24

llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ define void @vplan_incomplete_cases_tc2(i8 %x, i8 %y) {
1111
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
1212
; CHECK: [[VECTOR_BODY]]:
1313
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
14-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
14+
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
1515
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
1616
; CHECK: [[MIDDLE_BLOCK]]:
1717
; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]

0 commit comments

Comments
 (0)