@@ -1706,7 +1706,8 @@ class LoopVectorizationCostModel {
1706
1706
/// disabled or unsupported, then the scalable part will be equal to
1707
1707
/// ElementCount::getScalable(0).
1708
1708
FixedScalableVFPair computeFeasibleMaxVF (unsigned ConstTripCount,
1709
- ElementCount UserVF);
1709
+ ElementCount UserVF,
1710
+ bool FoldTailByMasking);
1710
1711
1711
1712
/// \return the maximized element count based on the target's vector
1712
1713
/// registers and the loop trip-count, but limited to a maximum safe VF.
@@ -1719,7 +1720,8 @@ class LoopVectorizationCostModel {
1719
1720
ElementCount getMaximizedVFForTarget (unsigned ConstTripCount,
1720
1721
unsigned SmallestType,
1721
1722
unsigned WidestType,
1722
- const ElementCount &MaxSafeVF);
1723
+ const ElementCount &MaxSafeVF,
1724
+ bool FoldTailByMasking);
1723
1725
1724
1726
/// \return the maximum legal scalable VF, based on the safe max number
1725
1727
/// of elements.
@@ -5317,9 +5319,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
5317
5319
return MaxScalableVF;
5318
5320
}
5319
5321
5320
- FixedScalableVFPair
5321
- LoopVectorizationCostModel::computeFeasibleMaxVF (unsigned ConstTripCount,
5322
- ElementCount UserVF) {
5322
+ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF (
5323
+ unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
5323
5324
MinBWs = computeMinimumValueSizes (TheLoop->getBlocks (), *DB, &TTI);
5324
5325
unsigned SmallestType, WidestType;
5325
5326
std::tie (SmallestType, WidestType) = getSmallestAndWidestTypes ();
@@ -5406,12 +5407,14 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5406
5407
5407
5408
FixedScalableVFPair Result (ElementCount::getFixed (1 ),
5408
5409
ElementCount::getScalable (0 ));
5409
- if (auto MaxVF = getMaximizedVFForTarget (ConstTripCount, SmallestType,
5410
- WidestType, MaxSafeFixedVF))
5410
+ if (auto MaxVF =
5411
+ getMaximizedVFForTarget (ConstTripCount, SmallestType, WidestType,
5412
+ MaxSafeFixedVF, FoldTailByMasking))
5411
5413
Result.FixedVF = MaxVF;
5412
5414
5413
- if (auto MaxVF = getMaximizedVFForTarget (ConstTripCount, SmallestType,
5414
- WidestType, MaxSafeScalableVF))
5415
+ if (auto MaxVF =
5416
+ getMaximizedVFForTarget (ConstTripCount, SmallestType, WidestType,
5417
+ MaxSafeScalableVF, FoldTailByMasking))
5415
5418
if (MaxVF.isScalable ()) {
5416
5419
Result.ScalableVF = MaxVF;
5417
5420
LLVM_DEBUG (dbgs () << " LV: Found feasible scalable VF = " << MaxVF
@@ -5444,7 +5447,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5444
5447
5445
5448
switch (ScalarEpilogueStatus) {
5446
5449
case CM_ScalarEpilogueAllowed:
5447
- return computeFeasibleMaxVF (TC, UserVF);
5450
+ return computeFeasibleMaxVF (TC, UserVF, false );
5448
5451
case CM_ScalarEpilogueNotAllowedUsePredicate:
5449
5452
LLVM_FALLTHROUGH;
5450
5453
case CM_ScalarEpilogueNotNeededUsePredicate:
@@ -5482,7 +5485,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5482
5485
LLVM_DEBUG (dbgs () << " LV: Cannot fold tail by masking: vectorize with a "
5483
5486
" scalar epilogue instead.\n " );
5484
5487
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5485
- return computeFeasibleMaxVF (TC, UserVF);
5488
+ return computeFeasibleMaxVF (TC, UserVF, false );
5486
5489
}
5487
5490
return FixedScalableVFPair::getNone ();
5488
5491
}
@@ -5499,7 +5502,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5499
5502
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue ();
5500
5503
}
5501
5504
5502
- FixedScalableVFPair MaxFactors = computeFeasibleMaxVF (TC, UserVF);
5505
+ FixedScalableVFPair MaxFactors = computeFeasibleMaxVF (TC, UserVF, true );
5503
5506
// Avoid tail folding if the trip count is known to be a multiple of any VF
5504
5507
// we chose.
5505
5508
// FIXME: The condition below pessimises the case for fixed-width vectors,
@@ -5572,7 +5575,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5572
5575
5573
5576
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget (
5574
5577
unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5575
- const ElementCount &MaxSafeVF) {
5578
+ const ElementCount &MaxSafeVF, bool FoldTailByMasking ) {
5576
5579
bool ComputeScalableMaxVF = MaxSafeVF.isScalable ();
5577
5580
TypeSize WidestRegister = TTI.getRegisterBitWidth (
5578
5581
ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
@@ -5604,14 +5607,17 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5604
5607
const auto TripCountEC = ElementCount::getFixed (ConstTripCount);
5605
5608
if (ConstTripCount &&
5606
5609
ElementCount::isKnownLE (TripCountEC, MaxVectorElementCount) &&
5607
- isPowerOf2_32 (ConstTripCount)) {
5608
- // We need to clamp the VF to be the ConstTripCount. There is no point in
5609
- // choosing a higher viable VF as done in the loop below. If
5610
- // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
5611
- // the TC is less than or equal to the known number of lanes.
5612
- LLVM_DEBUG (dbgs () << " LV: Clamping the MaxVF to the constant trip count: "
5610
+ (!FoldTailByMasking || isPowerOf2_32 (ConstTripCount))) {
5611
+ // If the loop trip count (TC) is known at compile time, there is no point in
5612
+ // choosing a VF greater than TC (as done in the loop below). Select the maximum
5613
+ // power of two which doesn't exceed TC.
5614
+ // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5615
+ // when the TC is less than or equal to the known number of lanes.
5616
+ auto ClampedConstTripCount = PowerOf2Floor (ConstTripCount);
5617
+ LLVM_DEBUG (dbgs () << " LV: Clamping the MaxVF to maximum power of two not "
5618
+ " exceeding the constant trip count: "
5613
5619
<< ConstTripCount << " \n " );
5614
- return TripCountEC ;
5620
+ return ElementCount::getFixed (ClampedConstTripCount) ;
5615
5621
}
5616
5622
5617
5623
ElementCount MaxVF = MaxVectorElementCount;
0 commit comments