Skip to content

Commit 2025e09

Browse files
author
Evgeniy Brevnov
committed
[LV] Make sure VF doesn't exceed compile time known TC
For the simple copy loop (see test case) the vectorizer selects a VF equal to 32 while the loop is known to have only 17 iterations. Such behavior makes no sense to me, since such a vector loop will never be executed. The only case where we may want to select a VF larger than TC is masked vectorization, so I haven't touched that case. Reviewed By: dmgreen Differential Revision: https://reviews.llvm.org/D114528
1 parent 9115d75 commit 2025e09

File tree

2 files changed

+44
-39
lines changed

2 files changed

+44
-39
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1706,7 +1706,8 @@ class LoopVectorizationCostModel {
17061706
/// disabled or unsupported, then the scalable part will be equal to
17071707
/// ElementCount::getScalable(0).
17081708
FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1709-
ElementCount UserVF);
1709+
ElementCount UserVF,
1710+
bool FoldTailByMasking);
17101711

17111712
/// \return the maximized element count based on the targets vector
17121713
/// registers and the loop trip-count, but limited to a maximum safe VF.
@@ -1719,7 +1720,8 @@ class LoopVectorizationCostModel {
17191720
ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
17201721
unsigned SmallestType,
17211722
unsigned WidestType,
1722-
const ElementCount &MaxSafeVF);
1723+
const ElementCount &MaxSafeVF,
1724+
bool FoldTailByMasking);
17231725

17241726
/// \return the maximum legal scalable VF, based on the safe max number
17251727
/// of elements.
@@ -5317,9 +5319,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
53175319
return MaxScalableVF;
53185320
}
53195321

5320-
FixedScalableVFPair
5321-
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5322-
ElementCount UserVF) {
5322+
FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
5323+
unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
53235324
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
53245325
unsigned SmallestType, WidestType;
53255326
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -5406,12 +5407,14 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
54065407

54075408
FixedScalableVFPair Result(ElementCount::getFixed(1),
54085409
ElementCount::getScalable(0));
5409-
if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
5410-
WidestType, MaxSafeFixedVF))
5410+
if (auto MaxVF =
5411+
getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5412+
MaxSafeFixedVF, FoldTailByMasking))
54115413
Result.FixedVF = MaxVF;
54125414

5413-
if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
5414-
WidestType, MaxSafeScalableVF))
5415+
if (auto MaxVF =
5416+
getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5417+
MaxSafeScalableVF, FoldTailByMasking))
54155418
if (MaxVF.isScalable()) {
54165419
Result.ScalableVF = MaxVF;
54175420
LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
@@ -5444,7 +5447,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
54445447

54455448
switch (ScalarEpilogueStatus) {
54465449
case CM_ScalarEpilogueAllowed:
5447-
return computeFeasibleMaxVF(TC, UserVF);
5450+
return computeFeasibleMaxVF(TC, UserVF, false);
54485451
case CM_ScalarEpilogueNotAllowedUsePredicate:
54495452
LLVM_FALLTHROUGH;
54505453
case CM_ScalarEpilogueNotNeededUsePredicate:
@@ -5482,7 +5485,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
54825485
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
54835486
"scalar epilogue instead.\n");
54845487
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5485-
return computeFeasibleMaxVF(TC, UserVF);
5488+
return computeFeasibleMaxVF(TC, UserVF, false);
54865489
}
54875490
return FixedScalableVFPair::getNone();
54885491
}
@@ -5499,7 +5502,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
54995502
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
55005503
}
55015504

5502-
FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
5505+
FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
55035506
// Avoid tail folding if the trip count is known to be a multiple of any VF
55045507
// we chose.
55055508
// FIXME: The condition below pessimises the case for fixed-width vectors,
@@ -5572,7 +5575,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
55725575

55735576
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
55745577
unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5575-
const ElementCount &MaxSafeVF) {
5578+
const ElementCount &MaxSafeVF, bool FoldTailByMasking) {
55765579
bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
55775580
TypeSize WidestRegister = TTI.getRegisterBitWidth(
55785581
ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
@@ -5604,14 +5607,17 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
56045607
const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
56055608
if (ConstTripCount &&
56065609
ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5607-
isPowerOf2_32(ConstTripCount)) {
5608-
// We need to clamp the VF to be the ConstTripCount. There is no point in
5609-
// choosing a higher viable VF as done in the loop below. If
5610-
// MaxVectorElementCount is scalable, we only fall back on a fixed VF when
5611-
// the TC is less than or equal to the known number of lanes.
5612-
LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5610+
(!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
5611+
// If loop trip count (TC) is known at compile time there is no point in
5612+
// choosing VF greater than TC (as done in the loop below). Select maximum
5613+
// power of two which doesn't exceed TC.
5614+
// If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5615+
// when the TC is less than or equal to the known number of lanes.
5616+
auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5617+
LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5618+
"exceeding the constant trip count: "
56135619
<< ConstTripCount << "\n");
5614-
return TripCountEC;
5620+
return ElementCount::getFixed(ClampedConstTripCount);
56155621
}
56165622

56175623
ElementCount MaxVF = MaxVectorElementCount;

llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,57 +4,56 @@
44
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128-ni:1-p2:32:8:8:32-ni:2"
55
target triple = "x86_64-unknown-linux-gnu"
66

7-
; TODO: Make sure selected VF for the main loop doesn't exceed TC.
87
; TODO: Make sure selected VF for the epilog loop doesn't exceed remaining TC.
98
define void @test1(i8 * noalias %src, i8 * noalias %dst) #0 {
109
; CHECK-LABEL: @test1(
1110
; CHECK-NEXT: iter.check:
12-
; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
11+
; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
1312
; CHECK: vector.main.loop.iter.check:
14-
; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
13+
; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
1514
; CHECK: vector.ph:
1615
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
1716
; CHECK: vector.body:
1817
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1918
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
2019
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP0]]
2120
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
22-
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <64 x i8>*
23-
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <64 x i8>, <64 x i8>* [[TMP3]], align 64
21+
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
22+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 64
2423
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[TMP0]]
2524
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
26-
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <64 x i8>*
27-
; CHECK-NEXT: store <64 x i8> [[WIDE_LOAD]], <64 x i8>* [[TMP6]], align 64
28-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
29-
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
25+
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>*
26+
; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], <16 x i8>* [[TMP6]], align 64
27+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
28+
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
3029
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
3130
; CHECK: middle.block:
32-
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 17, 0
31+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 17, 16
3332
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
3433
; CHECK: vec.epilog.iter.check:
3534
; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
3635
; CHECK: vec.epilog.ph:
37-
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
36+
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
3837
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
3938
; CHECK: vec.epilog.vector.body:
4039
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
4140
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
4241
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP8]]
4342
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP9]], i32 0
44-
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <32 x i8>*
45-
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <32 x i8>, <32 x i8>* [[TMP11]], align 64
43+
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>*
44+
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 64
4645
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[TMP8]]
4746
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0
48-
; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <32 x i8>*
49-
; CHECK-NEXT: store <32 x i8> [[WIDE_LOAD4]], <32 x i8>* [[TMP14]], align 64
50-
; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 32
51-
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 0
47+
; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <8 x i8>*
48+
; CHECK-NEXT: store <8 x i8> [[WIDE_LOAD4]], <8 x i8>* [[TMP14]], align 64
49+
; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8
50+
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 16
5251
; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
5352
; CHECK: vec.epilog.middle.block:
54-
; CHECK-NEXT: [[CMP_N3:%.*]] = icmp eq i64 17, 0
53+
; CHECK-NEXT: [[CMP_N3:%.*]] = icmp eq i64 17, 16
5554
; CHECK-NEXT: br i1 [[CMP_N3]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
5655
; CHECK: vec.epilog.scalar.ph:
57-
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
56+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
5857
; CHECK-NEXT: br label [[LOOP_MEMCPY_EXPANSION:%.*]]
5958
; CHECK: loop-memcpy-expansion:
6059
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]

0 commit comments

Comments
 (0)