Skip to content

Commit 50f8fbf

Browse files
committed
[LoopVectorize] Perform loop versioning for some early exit loops
When vectorising a loop with an uncountable early exit, we try to determine whether all the loads in the loop are known to be dereferenceable. If at least one load could potentially fault then we abandon vectorisation. This patch adds support for vectorising loops with one potentially faulting load by versioning the loop based on the load pointer alignment. If the vector load faults, it must always fault on the first lane, i.e. the load should not straddle a page boundary. Meeting this requirement ensures that the behaviour of the vector and scalar loops is identical, i.e. if a load does fault it will fault at the same scalar iteration. Such vectorisation depends on the following conditions being met: 1. The max vector width must not exceed the minimum page size. This is enforced by adding a getMaxSafeVectorWidthInBits wrapper that, for loops with an uncountable early exit and potentially faulting loads, limits the width to the minimum page size. For scalable vectors we must be able to determine the maximum possible value of vscale. 2. The size of the loaded type must be a power of 2. This is checked during legalisation. 3. The VF must be a power of 2 (so that the vector width divides wholly into the page size, which is also a power of 2). For fixed-width vectors this is always true, and for scalable vectors we query the TTI hook isVScaleKnownToBeAPowerOfTwo. If the effective runtime VF could change during the loop then the loop cannot be vectorised via loop versioning. 4. The load pointer must be aligned to a multiple of the vector width (NOTE: interleaving is currently disabled for these early exit loops). We add a runtime check to ensure this alignment holds.
1 parent e30576f commit 50f8fbf

File tree

7 files changed

+922
-183
lines changed

7 files changed

+922
-183
lines changed

llvm/include/llvm/Analysis/Loads.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,8 @@ bool isDereferenceableAndAlignedInLoop(
9292
/// contains read-only memory accesses.
9393
bool isDereferenceableReadOnlyLoop(
9494
Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
95-
SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr);
95+
SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr,
96+
SmallVectorImpl<LoadInst *> *NonDerefLoads = nullptr);
9697

9798
/// Return true if we know that executing a load from this value cannot trap.
9899
///

llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,19 @@ class LoopVectorizationLegality {
423423
unsigned getNumStores() const { return LAI->getNumStores(); }
424424
unsigned getNumLoads() const { return LAI->getNumLoads(); }
425425

426+
/// Return the number of loads in the loop that could potentially fault in a
427+
/// loop with uncountable early exits.
428+
unsigned getNumPotentiallyFaultingLoads() const {
429+
return PotentiallyFaultingLoads.size();
430+
}
431+
432+
/// Return a vector of all potentially faulting loads in a loop with
433+
/// uncountable early exits.
434+
const SmallVectorImpl<std::pair<LoadInst *, const SCEV *>> *
435+
getPotentiallyFaultingLoads() const {
436+
return &PotentiallyFaultingLoads;
437+
}
438+
426439
/// Returns a HistogramInfo* for the given instruction if it was determined
427440
/// to be part of a load -> update -> store sequence where multiple lanes
428441
/// may be working on the same memory address.
@@ -528,6 +541,11 @@ class LoopVectorizationLegality {
528541
/// additional cases safely.
529542
bool isVectorizableEarlyExitLoop();
530543

544+
/// Returns true if all loads in the loop contained in \p Loads can be
545+
/// analyzed as potentially faulting. Any loads that may fault are added to
546+
/// the member variable PotentiallyFaultingLoads.
547+
bool analyzePotentiallyFaultingLoads(SmallVectorImpl<LoadInst *> *Loads);
548+
531549
/// Return true if all of the instructions in the block can be speculatively
532550
/// executed, and record the loads/stores that require masking.
533551
/// \p SafePtrs is a list of addresses that are known to be legal and we know
@@ -652,6 +670,10 @@ class LoopVectorizationLegality {
652670
/// Keep track of the loop edge to an uncountable exit, comprising a pair
653671
/// of (Exiting, Exit) blocks, if there is exactly one early exit.
654672
std::optional<std::pair<BasicBlock *, BasicBlock *>> UncountableEdge;
673+
674+
/// Keep a record of all potentially faulting loads in loops with
675+
/// uncountable early exits.
676+
SmallVector<std::pair<LoadInst *, const SCEV *>, 4> PotentiallyFaultingLoads;
655677
};
656678

657679
} // namespace llvm

llvm/lib/Analysis/Loads.cpp

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -816,15 +816,26 @@ bool llvm::canReplacePointersIfEqual(const Value *From, const Value *To,
816816

817817
bool llvm::isDereferenceableReadOnlyLoop(
818818
Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
819-
SmallVectorImpl<const SCEVPredicate *> *Predicates) {
819+
SmallVectorImpl<const SCEVPredicate *> *Predicates,
820+
SmallVectorImpl<LoadInst *> *NonDerefLoads) {
821+
bool Result = true;
820822
for (BasicBlock *BB : L->blocks()) {
821823
for (Instruction &I : *BB) {
822824
if (auto *LI = dyn_cast<LoadInst>(&I)) {
823-
if (!isDereferenceableAndAlignedInLoop(LI, L, *SE, *DT, AC, Predicates))
825+
if (!isDereferenceableAndAlignedInLoop(LI, L, *SE, *DT, AC,
826+
Predicates)) {
827+
if (!NonDerefLoads)
828+
return false;
829+
NonDerefLoads->push_back(LI);
830+
Result = false;
831+
}
832+
} else if (I.mayReadFromMemory() || I.mayWriteToMemory() ||
833+
I.mayThrow()) {
834+
if (!NonDerefLoads)
824835
return false;
825-
} else if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
826-
return false;
836+
Result = false;
837+
}
827838
}
828839
}
829-
return true;
840+
return Result;
830841
}

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Lines changed: 48 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1609,6 +1609,43 @@ bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
16091609
return Result;
16101610
}
16111611

1612+
bool LoopVectorizationLegality::analyzePotentiallyFaultingLoads(
1613+
SmallVectorImpl<LoadInst *> *Loads) {
1614+
LLVM_DEBUG(dbgs() << "LV: Looking for potentially faulting loads in loop "
1615+
"with uncountable early exit:\n");
1616+
for (LoadInst *LI : *Loads) {
1617+
LLVM_DEBUG(dbgs() << "LV: Load: " << *LI << '\n');
1618+
Value *Ptr = LI->getPointerOperand();
1619+
if (!Ptr)
1620+
return false;
1621+
const SCEV *PtrExpr = PSE.getSCEV(Ptr);
1622+
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr);
1623+
// TODO: Deal with loop invariant pointers.
1624+
if (!AR || AR->getLoop() != TheLoop || !AR->isAffine())
1625+
return false;
1626+
auto Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*PSE.getSE()));
1627+
if (!Step)
1628+
return false;
1629+
const SCEV *Start = AR->getStart();
1630+
1631+
// Make sure the step is positive and matches the object size in memory.
1632+
// TODO: Extend this to cover more cases.
1633+
auto &DL = LI->getDataLayout();
1634+
APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getType()),
1635+
DL.getTypeStoreSize(LI->getType()).getFixedValue());
1636+
1637+
// Also discard element sizes that are not a power of 2, since the loop
1638+
// vectorizer can only perform loop versioning with pointer alignment
1639+
// checks for vector loads that are power-of-2 in size.
1640+
if (EltSize != Step->getAPInt() || !EltSize.isPowerOf2())
1641+
return false;
1642+
1643+
LLVM_DEBUG(dbgs() << "LV: SCEV for Load Ptr: " << *Start << '\n');
1644+
PotentiallyFaultingLoads.push_back({LI, Start});
1645+
}
1646+
return true;
1647+
}
1648+
16121649
bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
16131650
BasicBlock *LatchBB = TheLoop->getLoopLatch();
16141651
if (!LatchBB) {
@@ -1735,15 +1772,18 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
17351772
assert(LatchBB->getUniquePredecessor() == SingleUncountableEdge->first &&
17361773
"Expected latch predecessor to be the early exiting block");
17371774

1738-
// TODO: Handle loops that may fault.
17391775
Predicates.clear();
1740-
if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
1741-
&Predicates)) {
1742-
reportVectorizationFailure(
1743-
"Loop may fault",
1744-
"Cannot vectorize potentially faulting early exit loop",
1745-
"PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
1746-
return false;
1776+
SmallVector<LoadInst *, 4> Loads;
1777+
if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, &Predicates,
1778+
&Loads)) {
1779+
if (!TTI->getMinPageSize() || !analyzePotentiallyFaultingLoads(&Loads)) {
1780+
reportVectorizationFailure(
1781+
"Loop may fault",
1782+
"Cannot vectorize potentially faulting early exit loop",
1783+
"PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
1784+
return false;
1785+
}
1786+
LLVM_DEBUG(dbgs() << "We can vectorize the loop with runtime checks.\n");
17471787
}
17481788

17491789
[[maybe_unused]] const SCEV *SymbolicMaxBTC =

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 101 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -400,6 +400,12 @@ static cl::opt<bool> EnableEarlyExitVectorization(
400400
cl::desc(
401401
"Enable vectorization of early exit loops with uncountable exits."));
402402

403+
static cl::opt<unsigned> MaxNumPotentiallyFaultingPointers(
404+
"max-num-faulting-pointers", cl::init(1), cl::Hidden,
405+
cl::desc(
406+
"The maximum number of potentially faulting pointers we permit when "
407+
"vectorizing loops with uncountable exits."));
408+
403409
// Likelihood of bypassing the vectorized loop because assumptions about SCEV
404410
// variables not overflowing do not hold. See `emitSCEVChecks`.
405411
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
@@ -1585,6 +1591,22 @@ class LoopVectorizationCostModel {
15851591
ElementCount MaxSafeVF,
15861592
bool FoldTailByMasking);
15871593

1594+
bool isSafeForAnyVectorWidth() const {
1595+
return Legal->isSafeForAnyVectorWidth() &&
1596+
(!Legal->hasUncountableEarlyExit() ||
1597+
!Legal->getNumPotentiallyFaultingLoads());
1598+
}
1599+
1600+
uint64_t getMaxSafeVectorWidthInBits() const {
1601+
uint64_t MaxSafeVectorWidth = Legal->getMaxSafeVectorWidthInBits();
1602+
// The legalizer bails out if getMinPageSize does not return a value.
1603+
if (Legal->hasUncountableEarlyExit() &&
1604+
Legal->getNumPotentiallyFaultingLoads())
1605+
MaxSafeVectorWidth =
1606+
std::min(MaxSafeVectorWidth, uint64_t(*TTI.getMinPageSize()) * 8);
1607+
return MaxSafeVectorWidth;
1608+
}
1609+
15881610
/// Checks if scalable vectorization is supported and enabled. Caches the
15891611
/// result to avoid repeated debug dumps for repeated queries.
15901612
bool isScalableVectorizationAllowed();
@@ -2133,6 +2155,41 @@ class GeneratedRTChecks {
21332155
};
21342156
} // namespace
21352157

2158+
std::optional<unsigned> getMaxVScale(const Function &F,
2159+
const TargetTransformInfo &TTI) {
2160+
if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2161+
return MaxVScale;
2162+
2163+
if (F.hasFnAttribute(Attribute::VScaleRange))
2164+
return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2165+
2166+
return std::nullopt;
2167+
}
2168+
2169+
static void addPointerAlignmentChecks(
2170+
const SmallVectorImpl<std::pair<LoadInst *, const SCEV *>> *Loads,
2171+
Function *F, PredicatedScalarEvolution &PSE, TargetTransformInfo *TTI,
2172+
ElementCount VF) {
2173+
ScalarEvolution *SE = PSE.getSE();
2174+
const DataLayout &DL = SE->getDataLayout();
2175+
Type *PtrIntType = DL.getIntPtrType(SE->getContext());
2176+
2177+
const SCEV *Zero = SE->getZero(PtrIntType);
2178+
const SCEV *ScevEC = SE->getElementCount(PtrIntType, VF);
2179+
2180+
for (auto Load : *Loads) {
2181+
APInt EltSize(
2182+
DL.getIndexTypeSizeInBits(Load.first->getPointerOperandType()),
2183+
DL.getTypeStoreSize(Load.first->getType()).getFixedValue());
2184+
const SCEV *Start = SE->getPtrToIntExpr(Load.second, PtrIntType);
2185+
const SCEV *Align =
2186+
SE->getMulExpr(ScevEC, SE->getConstant(EltSize),
2187+
(SCEV::NoWrapFlags)(SCEV::FlagNSW | SCEV::FlagNUW));
2188+
const SCEV *Rem = SE->getURemExpr(Start, Align);
2189+
PSE.addPredicate(*(SE->getEqualPredicate(Rem, Zero)));
2190+
}
2191+
}
2192+
21362193
static bool useActiveLaneMask(TailFoldingStyle Style) {
21372194
return Style == TailFoldingStyle::Data ||
21382195
Style == TailFoldingStyle::DataAndControlFlow ||
@@ -2302,17 +2359,6 @@ emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
23022359
llvm_unreachable("invalid enum");
23032360
}
23042361

2305-
std::optional<unsigned> getMaxVScale(const Function &F,
2306-
const TargetTransformInfo &TTI) {
2307-
if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2308-
return MaxVScale;
2309-
2310-
if (F.hasFnAttribute(Attribute::VScaleRange))
2311-
return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2312-
2313-
return std::nullopt;
2314-
}
2315-
23162362
/// For the given VF and UF and maximum trip count computed for the loop, return
23172363
/// whether the induction variable might overflow in the vectorized loop. If not,
23182364
/// then we know a runtime overflow check always evaluates to false and can be
@@ -3796,13 +3842,22 @@ bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
37963842
return false;
37973843
}
37983844

3799-
if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
3845+
if (!isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
38003846
reportVectorizationInfo("The target does not provide maximum vscale value "
38013847
"for safe distance analysis.",
38023848
"ScalableVFUnfeasible", ORE, TheLoop);
38033849
return false;
38043850
}
38053851

3852+
if (Legal->hasUncountableEarlyExit() &&
3853+
Legal->getNumPotentiallyFaultingLoads() &&
3854+
!TTI.isVScaleKnownToBeAPowerOfTwo()) {
3855+
reportVectorizationInfo("Cannot vectorize potentially faulting early exit "
3856+
"loop with scalable vectors.",
3857+
"ScalableVFUnfeasible", ORE, TheLoop);
3858+
return false;
3859+
}
3860+
38063861
IsScalableVectorizationAllowed = true;
38073862
return true;
38083863
}
@@ -3814,7 +3869,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
38143869

38153870
auto MaxScalableVF = ElementCount::getScalable(
38163871
std::numeric_limits<ElementCount::ScalarTy>::max());
3817-
if (Legal->isSafeForAnyVectorWidth())
3872+
if (isSafeForAnyVectorWidth())
38183873
return MaxScalableVF;
38193874

38203875
std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
@@ -3841,11 +3896,11 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
38413896
// the memory accesses that is most restrictive (involved in the smallest
38423897
// dependence distance).
38433898
unsigned MaxSafeElements =
3844-
llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
3899+
llvm::bit_floor(getMaxSafeVectorWidthInBits() / WidestType);
38453900

38463901
auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
38473902
auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
3848-
if (!Legal->isSafeForAnyVectorWidth())
3903+
if (!isSafeForAnyVectorWidth())
38493904
this->MaxSafeElements = MaxSafeElements;
38503905

38513906
LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
@@ -10380,11 +10435,25 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1038010435
return false;
1038110436
}
1038210437

10383-
if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
10384-
reportVectorizationFailure("Auto-vectorization of loops with uncountable "
10385-
"early exit is not enabled",
10386-
"UncountableEarlyExitLoopsDisabled", ORE, L);
10387-
return false;
10438+
if (LVL.hasUncountableEarlyExit()) {
10439+
if (!EnableEarlyExitVectorization) {
10440+
reportVectorizationFailure("Auto-vectorization of loops with uncountable "
10441+
"early exit is not enabled",
10442+
"UncountableEarlyExitLoopsDisabled", ORE, L);
10443+
return false;
10444+
}
10445+
10446+
unsigned NumPotentiallyFaultingPointers =
10447+
LVL.getNumPotentiallyFaultingLoads();
10448+
if (NumPotentiallyFaultingPointers > MaxNumPotentiallyFaultingPointers) {
10449+
reportVectorizationFailure("Not worth vectorizing loop with uncountable "
10450+
"early exit, due to number of potentially "
10451+
"faulting loads",
10452+
"UncountableEarlyExitMayFault", ORE, L);
10453+
return false;
10454+
} else if (NumPotentiallyFaultingPointers)
10455+
LLVM_DEBUG(dbgs() << "LV: Need to version early-exit vector loop with "
10456+
<< "pointer alignment checks.\n");
1038810457
}
1038910458

1039010459
if (LVL.hasStructVectorCall()) {
@@ -10542,8 +10611,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1054210611
unsigned SelectedIC = std::max(IC, UserIC);
1054310612
// Optimistically generate runtime checks if they are needed. Drop them if
1054410613
// they turn out to not be profitable.
10545-
if (VF.Width.isVector() || SelectedIC > 1)
10614+
if (VF.Width.isVector() || SelectedIC > 1) {
10615+
if (LVL.getNumPotentiallyFaultingLoads()) {
10616+
assert(SelectedIC == 1 &&
10617+
"Interleaving not supported for early exit loops and "
10618+
"potentially faulting loads");
10619+
assert(!CM.foldTailWithEVL() &&
10620+
"Explicit vector length unsupported for early exit loops and "
10621+
"potentially faulting loads");
10622+
addPointerAlignmentChecks(LVL.getPotentiallyFaultingLoads(), F, PSE,
10623+
TTI, VF.Width);
10624+
}
1054610625
Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10626+
}
1054710627

1054810628
// Check if it is profitable to vectorize with runtime checks.
1054910629
bool ForceVectorization =

0 commit comments

Comments
 (0)