Skip to content

Commit ca000fb

Browse files
committed
[LoopVectorize] Perform loop versioning for some early exit loops
When attempting to vectorise a loop with an uncountable early exit, we attempt to discover if all the loads in the loop are known to be dereferenceable. If at least one load could potentially fault then we abandon vectorisation. This patch adds support for vectorising loops with one potentially faulting load by versioning the loop based on the load pointer alignment. It is required that if the vector load faults, it faults on the first lane, i.e. the load must not straddle a page boundary. This ensures that the behaviour of the vector and scalar loops is identical, i.e. if a load does fault it will fault at the same scalar iteration. Such vectorisation depends on the following conditions being met: 1. The max vector width must not exceed the minimum page size. This is enforced by a getMaxSafeVectorWidthInBits wrapper that limits the width to the minimum page size when we have an uncountable early exit with potentially faulting loads. For scalable vectors we must be able to determine the maximum possible value of vscale. 2. The size of the loaded type must be a power of 2. This is checked during legalisation. 3. The VF must be a power of two (so that the vector width can divide wholly into the page size, which is also a power of 2). For fixed-width vectors this is always true, and for scalable vectors we query the TTI hook isVScaleKnownToBeAPowerOfTwo. If the effective runtime VF could change during the loop then this cannot be vectorised via loop versioning. 4. The load pointer must be aligned to a multiple of the vector width. (NOTE: interleaving is currently disabled for these early exit loops.) We add a runtime check to ensure this is true.
1 parent 773f432 commit ca000fb

File tree

7 files changed

+444
-74
lines changed

7 files changed

+444
-74
lines changed

llvm/include/llvm/Analysis/Loads.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,8 @@ bool isDereferenceableAndAlignedInLoop(
9292
/// contains read-only memory accesses.
9393
bool isDereferenceableReadOnlyLoop(
9494
Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
95-
SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr);
95+
SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr,
96+
SmallVectorImpl<LoadInst *> *NonDerefLoads = nullptr);
9697

9798
/// Return true if we know that executing a load from this value cannot trap.
9899
///

llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,19 @@ class LoopVectorizationLegality {
429429
unsigned getNumStores() const { return LAI->getNumStores(); }
430430
unsigned getNumLoads() const { return LAI->getNumLoads(); }
431431

432+
/// Return the number of loads in the loop that could potentially fault in a
433+
/// loop with uncountable early exits.
434+
unsigned getNumPotentiallyFaultingLoads() const {
435+
return PotentiallyFaultingLoads.size();
436+
}
437+
438+
/// Return a vector of all potentially faulting loads in a loop with
439+
/// uncountable early exits.
440+
const SmallVectorImpl<std::pair<LoadInst *, const SCEV *>> *
441+
getPotentiallyFaultingLoads() const {
442+
return &PotentiallyFaultingLoads;
443+
}
444+
432445
/// Returns a HistogramInfo* for the given instruction if it was determined
433446
/// to be part of a load -> update -> store sequence where multiple lanes
434447
/// may be working on the same memory address.
@@ -537,6 +550,11 @@ class LoopVectorizationLegality {
537550
/// additional cases safely.
538551
bool isVectorizableEarlyExitLoop();
539552

553+
/// Returns true if all loads in the loop contained in \p Loads can be
554+
/// analyzed as potentially faulting. Any loads that may fault are added to
555+
/// the member variable PotentiallyFaultingLoads.
556+
bool analyzePotentiallyFaultingLoads(SmallVectorImpl<LoadInst *> *Loads);
557+
540558
/// Return true if all of the instructions in the block can be speculatively
541559
/// executed, and record the loads/stores that require masking.
542560
/// \p SafePtrs is a list of addresses that are known to be legal and we know
@@ -666,6 +684,10 @@ class LoopVectorizationLegality {
666684
/// Keep track of the destinations of all uncountable exits if the
667685
/// exact backedge taken count is not computable.
668686
SmallVector<BasicBlock *, 4> UncountableExitBlocks;
687+
688+
/// Keep a record of all potentially faulting loads in loops with
689+
/// uncountable early exits.
690+
SmallVector<std::pair<LoadInst *, const SCEV *>, 4> PotentiallyFaultingLoads;
669691
};
670692

671693
} // namespace llvm

llvm/lib/Analysis/Loads.cpp

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -805,15 +805,26 @@ bool llvm::canReplacePointersIfEqual(const Value *From, const Value *To,
805805

806806
bool llvm::isDereferenceableReadOnlyLoop(
807807
Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
808-
SmallVectorImpl<const SCEVPredicate *> *Predicates) {
808+
SmallVectorImpl<const SCEVPredicate *> *Predicates,
809+
SmallVectorImpl<LoadInst *> *NonDerefLoads) {
810+
bool Result = true;
809811
for (BasicBlock *BB : L->blocks()) {
810812
for (Instruction &I : *BB) {
811813
if (auto *LI = dyn_cast<LoadInst>(&I)) {
812-
if (!isDereferenceableAndAlignedInLoop(LI, L, *SE, *DT, AC, Predicates))
814+
if (!isDereferenceableAndAlignedInLoop(LI, L, *SE, *DT, AC,
815+
Predicates)) {
816+
if (!NonDerefLoads)
817+
return false;
818+
NonDerefLoads->push_back(LI);
819+
Result = false;
820+
}
821+
} else if (I.mayReadFromMemory() || I.mayWriteToMemory() ||
822+
I.mayThrow()) {
823+
if (!NonDerefLoads)
813824
return false;
814-
} else if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
815-
return false;
825+
Result = false;
826+
}
816827
}
817828
}
818-
return true;
829+
return Result;
819830
}

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Lines changed: 48 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1609,6 +1609,43 @@ bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
16091609
return Result;
16101610
}
16111611

1612+
bool LoopVectorizationLegality::analyzePotentiallyFaultingLoads(
1613+
SmallVectorImpl<LoadInst *> *Loads) {
1614+
LLVM_DEBUG(dbgs() << "LV: Looking for potentially faulting loads in loop "
1615+
"with uncountable early exit:\n");
1616+
for (LoadInst *LI : *Loads) {
1617+
LLVM_DEBUG(dbgs() << "LV: Load: " << *LI << '\n');
1618+
Value *Ptr = LI->getPointerOperand();
1619+
if (!Ptr)
1620+
return false;
1621+
const SCEV *PtrExpr = PSE.getSCEV(Ptr);
1622+
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr);
1623+
// TODO: Deal with loop invariant pointers.
1624+
if (!AR || AR->getLoop() != TheLoop || !AR->isAffine())
1625+
return false;
1626+
auto Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*PSE.getSE()));
1627+
if (!Step)
1628+
return false;
1629+
const SCEV *Start = AR->getStart();
1630+
1631+
// Make sure the step is positive and matches the object size in memory.
1632+
// TODO: Extend this to cover more cases.
1633+
auto &DL = LI->getDataLayout();
1634+
APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getType()),
1635+
DL.getTypeStoreSize(LI->getType()).getFixedValue());
1636+
1637+
// Also discard element sizes that are not a power of 2, since the loop
1638+
// vectorizer can only perform loop versioning with pointer alignment
1639+
// checks for vector loads that are power-of-2 in size.
1640+
if (EltSize != Step->getAPInt() || !EltSize.isPowerOf2())
1641+
return false;
1642+
1643+
LLVM_DEBUG(dbgs() << "LV: SCEV for Load Ptr: " << *Start << '\n');
1644+
PotentiallyFaultingLoads.push_back({LI, Start});
1645+
}
1646+
return true;
1647+
}
1648+
16121649
bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
16131650
BasicBlock *LatchBB = TheLoop->getLoopLatch();
16141651
if (!LatchBB) {
@@ -1731,15 +1768,18 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
17311768
assert(LatchBB->getUniquePredecessor() == getUncountableEarlyExitingBlock() &&
17321769
"Expected latch predecessor to be the early exiting block");
17331770

1734-
// TODO: Handle loops that may fault.
17351771
Predicates.clear();
1736-
if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
1737-
&Predicates)) {
1738-
reportVectorizationFailure(
1739-
"Loop may fault",
1740-
"Cannot vectorize potentially faulting early exit loop",
1741-
"PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
1742-
return false;
1772+
SmallVector<LoadInst *, 4> Loads;
1773+
if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, &Predicates,
1774+
&Loads)) {
1775+
if (!TTI->getMinPageSize() || !analyzePotentiallyFaultingLoads(&Loads)) {
1776+
reportVectorizationFailure(
1777+
"Loop may fault",
1778+
"Cannot vectorize potentially faulting early exit loop",
1779+
"PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
1780+
return false;
1781+
}
1782+
LLVM_DEBUG(dbgs() << "We can vectorize the loop with runtime checks.\n");
17431783
}
17441784

17451785
[[maybe_unused]] const SCEV *SymbolicMaxBTC =

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 101 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,12 @@ static cl::opt<bool> EnableEarlyExitVectorization(
390390
cl::desc(
391391
"Enable vectorization of early exit loops with uncountable exits."));
392392

393+
static cl::opt<unsigned> MaxNumPotentiallyFaultingPointers(
394+
"max-num-faulting-pointers", cl::init(1), cl::Hidden,
395+
cl::desc(
396+
"The maximum number of potentially faulting pointers we permit when "
397+
"vectorizing loops with uncountable exits."));
398+
393399
// Likelihood of bypassing the vectorized loop because assumptions about SCEV
394400
// variables not overflowing do not hold. See `emitSCEVChecks`.
395401
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
@@ -1582,6 +1588,22 @@ class LoopVectorizationCostModel {
15821588
ElementCount MaxSafeVF,
15831589
bool FoldTailByMasking);
15841590

1591+
bool isSafeForAnyVectorWidth() const {
1592+
return Legal->isSafeForAnyVectorWidth() &&
1593+
(!Legal->hasUncountableEarlyExit() ||
1594+
!Legal->getNumPotentiallyFaultingLoads());
1595+
}
1596+
1597+
uint64_t getMaxSafeVectorWidthInBits() const {
1598+
uint64_t MaxSafeVectorWidth = Legal->getMaxSafeVectorWidthInBits();
1599+
// The legalizer bails out if getMinPageSize does not return a value.
1600+
if (Legal->hasUncountableEarlyExit() &&
1601+
Legal->getNumPotentiallyFaultingLoads())
1602+
MaxSafeVectorWidth =
1603+
std::min(MaxSafeVectorWidth, uint64_t(*TTI.getMinPageSize()) * 8);
1604+
return MaxSafeVectorWidth;
1605+
}
1606+
15851607
/// Checks if scalable vectorization is supported and enabled. Caches the
15861608
/// result to avoid repeated debug dumps for repeated queries.
15871609
bool isScalableVectorizationAllowed();
@@ -2123,6 +2145,41 @@ class GeneratedRTChecks {
21232145
};
21242146
} // namespace
21252147

2148+
std::optional<unsigned> getMaxVScale(const Function &F,
2149+
const TargetTransformInfo &TTI) {
2150+
if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2151+
return MaxVScale;
2152+
2153+
if (F.hasFnAttribute(Attribute::VScaleRange))
2154+
return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2155+
2156+
return std::nullopt;
2157+
}
2158+
2159+
static void addPointerAlignmentChecks(
2160+
const SmallVectorImpl<std::pair<LoadInst *, const SCEV *>> *Loads,
2161+
Function *F, PredicatedScalarEvolution &PSE, TargetTransformInfo *TTI,
2162+
ElementCount VF) {
2163+
ScalarEvolution *SE = PSE.getSE();
2164+
const DataLayout &DL = SE->getDataLayout();
2165+
Type *PtrIntType = DL.getIntPtrType(SE->getContext());
2166+
2167+
const SCEV *Zero = SE->getZero(PtrIntType);
2168+
const SCEV *ScevEC = SE->getElementCount(PtrIntType, VF);
2169+
2170+
for (auto Load : *Loads) {
2171+
APInt EltSize(
2172+
DL.getIndexTypeSizeInBits(Load.first->getPointerOperandType()),
2173+
DL.getTypeStoreSize(Load.first->getType()).getFixedValue());
2174+
const SCEV *Start = SE->getPtrToIntExpr(Load.second, PtrIntType);
2175+
const SCEV *Align =
2176+
SE->getMulExpr(ScevEC, SE->getConstant(EltSize),
2177+
(SCEV::NoWrapFlags)(SCEV::FlagNSW | SCEV::FlagNUW));
2178+
const SCEV *Rem = SE->getURemExpr(Start, Align);
2179+
PSE.addPredicate(*(SE->getEqualPredicate(Rem, Zero)));
2180+
}
2181+
}
2182+
21262183
static bool useActiveLaneMask(TailFoldingStyle Style) {
21272184
return Style == TailFoldingStyle::Data ||
21282185
Style == TailFoldingStyle::DataAndControlFlow ||
@@ -2292,17 +2349,6 @@ emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
22922349
llvm_unreachable("invalid enum");
22932350
}
22942351

2295-
std::optional<unsigned> getMaxVScale(const Function &F,
2296-
const TargetTransformInfo &TTI) {
2297-
if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2298-
return MaxVScale;
2299-
2300-
if (F.hasFnAttribute(Attribute::VScaleRange))
2301-
return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2302-
2303-
return std::nullopt;
2304-
}
2305-
23062352
/// For the given VF and UF and maximum trip count computed for the loop, return
23072353
/// whether the induction variable might overflow in the vectorized loop. If not,
23082354
/// then we know a runtime overflow check always evaluates to false and can be
@@ -3899,13 +3945,22 @@ bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
38993945
return false;
39003946
}
39013947

3902-
if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
3948+
if (!isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
39033949
reportVectorizationInfo("The target does not provide maximum vscale value "
39043950
"for safe distance analysis.",
39053951
"ScalableVFUnfeasible", ORE, TheLoop);
39063952
return false;
39073953
}
39083954

3955+
if (Legal->hasUncountableEarlyExit() &&
3956+
Legal->getNumPotentiallyFaultingLoads() &&
3957+
!TTI.isVScaleKnownToBeAPowerOfTwo()) {
3958+
reportVectorizationInfo("Cannot vectorize potentially faulting early exit "
3959+
"loop with scalable vectors.",
3960+
"ScalableVFUnfeasible", ORE, TheLoop);
3961+
return false;
3962+
}
3963+
39093964
IsScalableVectorizationAllowed = true;
39103965
return true;
39113966
}
@@ -3917,7 +3972,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
39173972

39183973
auto MaxScalableVF = ElementCount::getScalable(
39193974
std::numeric_limits<ElementCount::ScalarTy>::max());
3920-
if (Legal->isSafeForAnyVectorWidth())
3975+
if (isSafeForAnyVectorWidth())
39213976
return MaxScalableVF;
39223977

39233978
std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
@@ -3944,11 +3999,11 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
39443999
// the memory accesses that is most restrictive (involved in the smallest
39454000
// dependence distance).
39464001
unsigned MaxSafeElements =
3947-
llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
4002+
llvm::bit_floor(getMaxSafeVectorWidthInBits() / WidestType);
39484003

39494004
auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
39504005
auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
3951-
if (!Legal->isSafeForAnyVectorWidth())
4006+
if (!isSafeForAnyVectorWidth())
39524007
this->MaxSafeElements = MaxSafeElements;
39534008

39544009
LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
@@ -10346,11 +10401,25 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1034610401
return false;
1034710402
}
1034810403

10349-
if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
10350-
reportVectorizationFailure("Auto-vectorization of loops with uncountable "
10351-
"early exit is not enabled",
10352-
"UncountableEarlyExitLoopsDisabled", ORE, L);
10353-
return false;
10404+
if (LVL.hasUncountableEarlyExit()) {
10405+
if (!EnableEarlyExitVectorization) {
10406+
reportVectorizationFailure("Auto-vectorization of loops with uncountable "
10407+
"early exit is not enabled",
10408+
"UncountableEarlyExitLoopsDisabled", ORE, L);
10409+
return false;
10410+
}
10411+
10412+
unsigned NumPotentiallyFaultingPointers =
10413+
LVL.getNumPotentiallyFaultingLoads();
10414+
if (NumPotentiallyFaultingPointers > MaxNumPotentiallyFaultingPointers) {
10415+
reportVectorizationFailure("Not worth vectorizing loop with uncountable "
10416+
"early exit, due to number of potentially "
10417+
"faulting loads",
10418+
"UncountableEarlyExitMayFault", ORE, L);
10419+
return false;
10420+
} else if (NumPotentiallyFaultingPointers)
10421+
LLVM_DEBUG(dbgs() << "LV: Need to version early-exit vector loop with "
10422+
<< "pointer alignment checks.\n");
1035410423
}
1035510424

1035610425
if (LVL.hasStructVectorCall()) {
@@ -10508,8 +10577,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1050810577
unsigned SelectedIC = std::max(IC, UserIC);
1050910578
// Optimistically generate runtime checks if they are needed. Drop them if
1051010579
// they turn out to not be profitable.
10511-
if (VF.Width.isVector() || SelectedIC > 1)
10580+
if (VF.Width.isVector() || SelectedIC > 1) {
10581+
if (LVL.getNumPotentiallyFaultingLoads()) {
10582+
assert(SelectedIC == 1 &&
10583+
"Interleaving not supported for early exit loops and "
10584+
"potentially faulting loads");
10585+
assert(!CM.foldTailWithEVL() &&
10586+
"Explicit vector length unsupported for early exit loops and "
10587+
"potentially faulting loads");
10588+
addPointerAlignmentChecks(LVL.getPotentiallyFaultingLoads(), F, PSE,
10589+
TTI, VF.Width);
10590+
}
1051210591
Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10592+
}
1051310593

1051410594
// Check if it is profitable to vectorize with runtime checks.
1051510595
bool ForceVectorization =

0 commit comments

Comments
 (0)