Skip to content

Commit b93a9c3

Browse files
committed
LoopVectorize: optimize VF for low TC, when tail-folding
9a087a3 (LoopVectorize: MaxVF should not be larger than the loop trip count) was the first commit to add the isPowerOf2_32() condition on the trip-count to what is now getMaximizedVFForTarget(). It made sense at the time, as there was no tail-folding support. Much later, 2025e09 ([LV] Make sure VF doesn't exceed compile time known TC) came along to patch this with an extra condition on FoldTailByMasking, in order to ensure that the VF doesn't exceed the trip-count. However, it didn't go far enough, and we can do better, as there is existing code to clamp the trip-count and do tail-folding. Fixes #82626.
1 parent 58289fd commit b93a9c3

File tree

3 files changed

+98
-73
lines changed

3 files changed

+98
-73
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 11 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4758,26 +4758,21 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
47584758
WidestRegisterMinEC *= Min;
47594759
}
47604760

4761-
// When a scalar epilogue is required, at least one iteration of the scalar
4762-
// loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4763-
// max VF that results in a dead vector loop.
4764-
if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4765-
MaxTripCount -= 1;
4766-
4767-
if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4768-
(!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4769-
// If upper bound loop trip count (TC) is known at compile time there is no
4770-
// point in choosing VF greater than TC (as done in the loop below). Select
4771-
// maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4772-
// scalable, we only fall back on a fixed VF when the TC is less than or
4773-
// equal to the known number of lanes.
4761+
if (MaxTripCount > 0 && MaxTripCount <= WidestRegisterMinEC &&
4762+
requiresScalarEpilogue(true)) {
4763+
// When a scalar epilogue is required, at least one iteration of the scalar
4764+
// loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4765+
// max VF that results in a dead vector loop.
4766+
--MaxTripCount;
4767+
4768+
// When a scalar epilogue is required, if upper bound loop trip count (TC)
4769+
// is known at compile time, clamp the VF to a maximum power of two which
4770+
// doesn't exceed TC, and vectorize using a fixed-length vector.
47744771
auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
47754772
LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
47764773
"exceeding the constant trip count: "
47774774
<< ClampedUpperTripCount << "\n");
4778-
return ElementCount::get(
4779-
ClampedUpperTripCount,
4780-
FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4775+
return ElementCount::getFixed(ClampedUpperTripCount);
47814776
}
47824777

47834778
TargetTransformInfo::RegisterKind RegKind =

llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll

Lines changed: 86 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -258,36 +258,46 @@ for.end: ; preds = %for.body
258258
define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
259259
; CHECK-LABEL: @trip16_i8(
260260
; CHECK-NEXT: entry:
261-
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
261+
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
262+
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
263+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
264+
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
262265
; CHECK: vector.ph:
266+
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
267+
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
268+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
269+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
270+
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
271+
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
263272
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
264273
; CHECK: vector.body:
265274
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
266-
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
267-
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
268-
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
269-
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
270-
; CHECK-NEXT: [[TMP3:%.*]] = shl <16 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
271-
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
272-
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
273-
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
274-
; CHECK-NEXT: [[TMP6:%.*]] = add <16 x i8> [[TMP3]], [[WIDE_LOAD1]]
275-
; CHECK-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP5]], align 1
276-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
275+
; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
276+
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
277+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
278+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
279+
; CHECK-NEXT: [[TMP9:%.*]] = shl <vscale x 8 x i8> [[WIDE_LOAD]], shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 1, i64 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
280+
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP6]]
281+
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
282+
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
283+
; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 8 x i8> [[TMP9]], [[WIDE_LOAD1]]
284+
; CHECK-NEXT: store <vscale x 8 x i8> [[TMP12]], ptr [[TMP11]], align 1
285+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
277286
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
278287
; CHECK: middle.block:
279-
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
288+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
289+
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
280290
; CHECK: scalar.ph:
281-
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
291+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
282292
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
283293
; CHECK: for.body:
284294
; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
285295
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
286-
; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
287-
; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP7]], 1
296+
; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
297+
; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP13]], 1
288298
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
289-
; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
290-
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP8]]
299+
; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
300+
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP14]]
291301
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
292302
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
293303
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16
@@ -319,36 +329,46 @@ for.end: ; preds = %for.body
319329
define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
320330
; CHECK-LABEL: @trip32_i8(
321331
; CHECK-NEXT: entry:
322-
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
332+
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
333+
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
334+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 32, [[TMP1]]
335+
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
323336
; CHECK: vector.ph:
337+
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
338+
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
339+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 32, [[TMP3]]
340+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 32, [[N_MOD_VF]]
341+
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
342+
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
324343
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
325344
; CHECK: vector.body:
326345
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
327-
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
328-
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
329-
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
330-
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP2]], align 1
331-
; CHECK-NEXT: [[TMP3:%.*]] = shl <32 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
332-
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
333-
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
334-
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP5]], align 1
335-
; CHECK-NEXT: [[TMP6:%.*]] = add <32 x i8> [[TMP3]], [[WIDE_LOAD1]]
336-
; CHECK-NEXT: store <32 x i8> [[TMP6]], ptr [[TMP5]], align 1
337-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
346+
; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
347+
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
348+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
349+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
350+
; CHECK-NEXT: [[TMP9:%.*]] = shl <vscale x 16 x i8> [[WIDE_LOAD]], shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
351+
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP6]]
352+
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
353+
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
354+
; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 16 x i8> [[TMP9]], [[WIDE_LOAD1]]
355+
; CHECK-NEXT: store <vscale x 16 x i8> [[TMP12]], ptr [[TMP11]], align 1
356+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
338357
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
339358
; CHECK: middle.block:
340-
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
359+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 32, [[N_VEC]]
360+
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
341361
; CHECK: scalar.ph:
342-
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
362+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
343363
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
344364
; CHECK: for.body:
345365
; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
346366
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
347-
; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
348-
; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP7]], 1
367+
; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
368+
; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP13]], 1
349369
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
350-
; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
351-
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP8]]
370+
; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
371+
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP14]]
352372
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
353373
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
354374
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 32
@@ -379,37 +399,47 @@ for.end: ; preds = %for.body
379399
define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
380400
; CHECK-LABEL: @trip24_i8(
381401
; CHECK-NEXT: entry:
382-
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
402+
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
403+
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
404+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 24, [[TMP1]]
405+
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
383406
; CHECK: vector.ph:
407+
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
408+
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
409+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 24, [[TMP3]]
410+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 24, [[N_MOD_VF]]
411+
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
412+
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
384413
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
385414
; CHECK: vector.body:
386415
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
387-
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
388-
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
389-
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
390-
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
391-
; CHECK-NEXT: [[TMP3:%.*]] = shl <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
392-
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
393-
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
394-
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
395-
; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i8> [[TMP3]], [[WIDE_LOAD1]]
396-
; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1
397-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
398-
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
399-
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
416+
; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
417+
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
418+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
419+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
420+
; CHECK-NEXT: [[TMP9:%.*]] = shl <vscale x 4 x i8> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i8> insertelement (<vscale x 4 x i8> poison, i8 1, i64 0), <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer)
421+
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP6]]
422+
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
423+
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
424+
; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i8> [[TMP9]], [[WIDE_LOAD1]]
425+
; CHECK-NEXT: store <vscale x 4 x i8> [[TMP12]], ptr [[TMP11]], align 1
426+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
427+
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
428+
; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
400429
; CHECK: middle.block:
401-
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
430+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 24, [[N_VEC]]
431+
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
402432
; CHECK: scalar.ph:
403-
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
433+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
404434
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
405435
; CHECK: for.body:
406436
; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
407437
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
408-
; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
409-
; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP8]], 1
438+
; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
439+
; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP14]], 1
410440
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
411-
; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
412-
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP9]]
441+
; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
442+
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]]
413443
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
414444
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
415445
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 24

llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ define void @vplan_incomplete_cases_tc2(i8 %x, i8 %y) {
1111
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
1212
; CHECK: [[VECTOR_BODY]]:
1313
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
14-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
14+
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
1515
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
1616
; CHECK: [[MIDDLE_BLOCK]]:
1717
; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]

0 commit comments

Comments
 (0)