Skip to content

Commit e0e8ca8

Browse files
committed
[WIP][LoopVectorize] Perform loop versioning for some early exit loops
1 parent beea5ac commit e0e8ca8

File tree

7 files changed

+207
-24
lines changed

7 files changed

+207
-24
lines changed

llvm/include/llvm/Analysis/Loads.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,8 @@ bool isDereferenceableAndAlignedInLoop(
9292
/// contains read-only memory accesses.
9393
bool isDereferenceableReadOnlyLoop(
9494
Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
95-
SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr);
95+
SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr,
96+
SmallVectorImpl<LoadInst *> *NonDerefLoads = nullptr);
9697

9798
/// Return true if we know that executing a load from this value cannot trap.
9899
///

llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,19 @@ class LoopVectorizationLegality {
425425
unsigned getNumStores() const { return LAI->getNumStores(); }
426426
unsigned getNumLoads() const { return LAI->getNumLoads(); }
427427

428+
/// Return the number of loads in the loop we have to consider that could
429+
/// potentially fault in a loop with uncountable early exits.
/// This is simply the current size of PotentiallyFaultingLoads, so it is 0
/// when no such load has been recorded.
430+
unsigned getNumPotentiallyFaultingLoads() const {
431+
return PotentiallyFaultingLoads.size();
432+
}
433+
434+
/// Return a vector of all potentially faulting loads in a loop with
435+
/// uncountable early exits.
/// Each entry pairs a load with a SCEV associated with it. The result is the
/// address of the PotentiallyFaultingLoads member, so it is never null; the
/// list itself may be empty.
436+
const SmallVectorImpl<std::pair<LoadInst *, const SCEV *>> *
437+
getPotentiallyFaultingLoads() const {
438+
return &PotentiallyFaultingLoads;
439+
}
440+
428441
/// Returns a HistogramInfo* for the given instruction if it was determined
429442
/// to be part of a load -> update -> store sequence where multiple lanes
430443
/// may be working on the same memory address.
@@ -533,6 +546,8 @@ class LoopVectorizationLegality {
533546
/// additional cases safely.
534547
bool isVectorizableEarlyExitLoop();
535548

549+
bool analyzePotentiallyFaultingLoads(SmallVectorImpl<LoadInst *> *Loads);
550+
536551
/// Return true if all of the instructions in the block can be speculatively
537552
/// executed, and record the loads/stores that require masking.
538553
/// \p SafePtrs is a list of addresses that are known to be legal and we know
@@ -656,6 +671,10 @@ class LoopVectorizationLegality {
656671
/// Keep track of the destinations of all uncountable exits if the
657672
/// exact backedge taken count is not computable.
658673
SmallVector<BasicBlock *, 4> UncountableExitBlocks;
674+
675+
/// Keep a record of all potentially faulting loads in loops with
676+
/// uncountable early exits.
677+
SmallVector<std::pair<LoadInst *, const SCEV *>, 4> PotentiallyFaultingLoads;
659678
};
660679

661680
} // namespace llvm

llvm/lib/Analysis/Loads.cpp

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -808,15 +808,26 @@ bool llvm::canReplacePointersIfEqual(const Value *From, const Value *To,
808808

809809
bool llvm::isDereferenceableReadOnlyLoop(
810810
Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
811-
SmallVectorImpl<const SCEVPredicate *> *Predicates) {
811+
SmallVectorImpl<const SCEVPredicate *> *Predicates,
812+
SmallVectorImpl<LoadInst *> *NonDerefLoads) {
813+
bool Result = true;
812814
for (BasicBlock *BB : L->blocks()) {
813815
for (Instruction &I : *BB) {
814816
if (auto *LI = dyn_cast<LoadInst>(&I)) {
815-
if (!isDereferenceableAndAlignedInLoop(LI, L, *SE, *DT, AC, Predicates))
817+
if (!isDereferenceableAndAlignedInLoop(LI, L, *SE, *DT, AC,
818+
Predicates)) {
819+
if (!NonDerefLoads)
820+
return false;
821+
NonDerefLoads->push_back(LI);
822+
Result = false;
823+
}
824+
} else if (I.mayReadFromMemory() || I.mayWriteToMemory() ||
825+
I.mayThrow()) {
826+
if (!NonDerefLoads)
816827
return false;
817-
} else if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
818-
return false;
828+
Result = false;
829+
}
819830
}
820831
}
821-
return true;
832+
return Result;
822833
}

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Lines changed: 44 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1589,6 +1589,39 @@ bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
15891589
return Result;
15901590
}
15911591

1592+
/// Decide whether every load in \p Loads can be recorded as a "potentially
/// faulting" load for a loop with an uncountable early exit.
///
/// A load qualifies when the SCEV of its pointer operand is an affine
/// add-recurrence in TheLoop whose step is a constant equal to the store size
/// of the loaded type. For each qualifying load the pair
/// (load, start of the add-recurrence) is appended to
/// PotentiallyFaultingLoads.
///
/// \param Loads the loads to analyze; must not be null.
/// \returns true if \p Loads is non-empty and every load qualifies. On a
/// false return PotentiallyFaultingLoads is left exactly as it was on entry.
bool LoopVectorizationLegality::analyzePotentiallyFaultingLoads(
    SmallVectorImpl<LoadInst *> *Loads) {
  assert(Loads && "Expected a list of loads to analyze");

  // An empty list means there is no load we could guard with a runtime check.
  // Reporting success here would let a loop through without any evidence that
  // it is safe, so be conservative.
  if (Loads->empty())
    return false;

  LLVM_DEBUG(dbgs() << "Found potentially faulting loads in loop with "
                       "uncountable early exit:\n");

  // Collect results locally and only commit them once every load has been
  // accepted, so that a failure never leaves a partially filled member list.
  SmallVector<std::pair<LoadInst *, const SCEV *>, 4> Accepted;
  for (LoadInst *LI : *Loads) {
    LLVM_DEBUG(dbgs() << "Load: " << *LI << '\n');
    Value *Ptr = LI->getPointerOperand();
    const auto *AR = dyn_cast<SCEVAddRecExpr>(PSE.getSCEV(Ptr));
    // TODO: Deal with loop invariant pointers.
    if (!AR || AR->getLoop() != TheLoop || !AR->isAffine())
      return false;
    const auto *Step =
        dyn_cast<SCEVConstant>(AR->getStepRecurrence(*PSE.getSE()));
    if (!Step)
      return false;

    // Make sure the step is positive and matches the object size in memory.
    // Comparing against a plain integer is independent of the step's bit
    // width; a negative step has its high bits set and so can never compare
    // equal to the (small, positive) store size.
    // TODO: Extend this to cover more cases.
    const DataLayout &DL = LI->getDataLayout();
    const uint64_t EltSize = DL.getTypeStoreSize(LI->getType()).getFixedValue();
    if (Step->getAPInt() != EltSize)
      return false;

    const SCEV *Start = AR->getStart();
    LLVM_DEBUG(dbgs() << "SCEV for Load Ptr: " << *Start << '\n');
    Accepted.push_back({LI, Start});
  }

  PotentiallyFaultingLoads.append(Accepted.begin(), Accepted.end());
  return true;
}
1624+
15921625
bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
15931626
BasicBlock *LatchBB = TheLoop->getLoopLatch();
15941627
if (!LatchBB) {
@@ -1713,15 +1746,18 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
17131746
assert(LatchBB->getUniquePredecessor() == getUncountableEarlyExitingBlock() &&
17141747
"Expected latch predecessor to be the early exiting block");
17151748

1716-
// TODO: Handle loops that may fault.
17171749
Predicates.clear();
1718-
if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
1719-
&Predicates)) {
1720-
reportVectorizationFailure(
1721-
"Loop may fault",
1722-
"Cannot vectorize potentially faulting early exit loop",
1723-
"PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
1724-
return false;
1750+
SmallVector<LoadInst *, 4> Loads;
1751+
if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, &Predicates,
1752+
&Loads)) {
1753+
if (!analyzePotentiallyFaultingLoads(&Loads)) {
1754+
reportVectorizationFailure(
1755+
"Loop may fault",
1756+
"Cannot vectorize potentially faulting early exit loop",
1757+
"PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
1758+
return false;
1759+
}
1760+
LLVM_DEBUG(dbgs() << "We can vectorize the loop with runtime checks.\n");
17251761
}
17261762

17271763
[[maybe_unused]] const SCEV *SymbolicMaxBTC =

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 47 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2135,6 +2135,29 @@ class GeneratedRTChecks {
21352135
};
21362136
} // namespace
21372137

2138+
static void addPointerAlignmentChecks(
2139+
const SmallVectorImpl<std::pair<LoadInst *, const SCEV *>> *Loads,
2140+
PredicatedScalarEvolution &PSE, ElementCount VF) {
2141+
ScalarEvolution *SE = PSE.getSE();
2142+
const DataLayout &DL = SE->getDataLayout();
2143+
Type *PtrIntType = DL.getIntPtrType(SE->getContext());
2144+
2145+
const SCEV *Zero = SE->getZero(PtrIntType);
2146+
const SCEV *ScevEC = SE->getElementCount(PtrIntType, VF);
2147+
2148+
for (auto Load : *Loads) {
2149+
APInt EltSize(
2150+
DL.getIndexTypeSizeInBits(Load.first->getPointerOperandType()),
2151+
DL.getTypeStoreSize(Load.first->getType()).getFixedValue());
2152+
const SCEV *Start = SE->getPtrToIntExpr(Load.second, PtrIntType);
2153+
const SCEV *Align =
2154+
SE->getMulExpr(ScevEC, SE->getConstant(EltSize),
2155+
(SCEV::NoWrapFlags)(SCEV::FlagNSW | SCEV::FlagNUW));
2156+
const SCEV *Rem = SE->getURemExpr(Start, Align);
2157+
PSE.addPredicate(*(SE->getEqualPredicate(Rem, Zero)));
2158+
}
2159+
}
2160+
21382161
static bool useActiveLaneMask(TailFoldingStyle Style) {
21392162
return Style == TailFoldingStyle::Data ||
21402163
Style == TailFoldingStyle::DataAndControlFlow ||
@@ -10236,11 +10259,25 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1023610259
return false;
1023710260
}
1023810261

10239-
if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
10240-
reportVectorizationFailure("Auto-vectorization of loops with uncountable "
10241-
"early exit is not enabled",
10242-
"UncountableEarlyExitLoopsDisabled", ORE, L);
10243-
return false;
10262+
if (LVL.hasUncountableEarlyExit()) {
10263+
if (!EnableEarlyExitVectorization) {
10264+
reportVectorizationFailure("Auto-vectorization of loops with uncountable "
10265+
"early exit is not enabled",
10266+
"UncountableEarlyExitLoopsDisabled", ORE, L);
10267+
return false;
10268+
}
10269+
10270+
unsigned NumPotentiallyFaultingPointers =
10271+
LVL.getNumPotentiallyFaultingLoads();
10272+
if (NumPotentiallyFaultingPointers > 1) {
10273+
reportVectorizationFailure("Not worth vectorizing loop with uncountable "
10274+
"early exit, due to number of potentially "
10275+
"faulting loads",
10276+
"UncountableEarlyExitMayFault", ORE, L);
10277+
return false;
10278+
} else if (NumPotentiallyFaultingPointers)
10279+
LLVM_DEBUG(dbgs() << "LV: Need to version early-exit vector loop with "
10280+
<< "pointer alignment checks.\n");
1024410281
}
1024510282

1024610283
// Entrance to the VPlan-native vectorization path. Outer loops are processed
@@ -10391,8 +10428,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1039110428
unsigned SelectedIC = std::max(IC, UserIC);
1039210429
// Optimistically generate runtime checks if they are needed. Drop them if
1039310430
// they turn out to not be profitable.
10394-
if (VF.Width.isVector() || SelectedIC > 1)
10431+
if (VF.Width.isVector() || SelectedIC > 1) {
10432+
if (LVL.getNumPotentiallyFaultingLoads())
10433+
addPointerAlignmentChecks(LVL.getPotentiallyFaultingLoads(), PSE,
10434+
VF.Width);
1039510435
Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10436+
}
1039610437

1039710438
// Check if it is profitable to vectorize with runtime checks.
1039810439
bool ForceVectorization =

llvm/test/Transforms/LoopVectorize/early_exit_legality.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ loop.end:
208208

209209
define i64 @same_exit_block_pre_inc_use1_too_small_allocas() {
210210
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_too_small_allocas'
211-
; CHECK: LV: Not vectorizing: Loop may fault.
211+
; CHECK: LV: Not vectorizing: Not worth vectorizing loop with uncountable early exit, due to number of potentially faulting loads.
212212
entry:
213213
%p1 = alloca [42 x i8]
214214
%p2 = alloca [42 x i8]
@@ -238,7 +238,7 @@ loop.end:
238238

239239
define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(ptr dereferenceable(42) %p1, ptr dereferenceable(42) %p2) {
240240
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_too_small_deref_ptrs'
241-
; CHECK: LV: Not vectorizing: Loop may fault.
241+
; CHECK: LV: Not vectorizing: Not worth vectorizing loop with uncountable early exit, due to number of potentially faulting loads.
242242
entry:
243243
br label %loop
244244

@@ -264,7 +264,7 @@ loop.end:
264264

265265
define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(ptr %p1, ptr %p2) {
266266
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_unknown_ptrs'
267-
; CHECK: LV: Not vectorizing: Loop may fault.
267+
; CHECK: LV: Not vectorizing: Not worth vectorizing loop with uncountable early exit, due to number of potentially faulting loads.
268268
entry:
269269
br label %loop
270270

llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll

Lines changed: 76 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2-
; RUN: opt -S < %s -p loop-vectorize | FileCheck %s
2+
; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization | FileCheck %s
33

44
declare void @init_mem(ptr, i64);
55

@@ -141,3 +141,78 @@ loop.end:
141141
%retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
142142
ret i64 %retval
143143
}
144+
145+
define i64 @same_exit_block_pre_inc_use1_unknown_single_ptr(ptr %p1) {
146+
; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_unknown_single_ptr(
147+
; CHECK-SAME: ptr [[P1:%.*]]) {
148+
; CHECK-NEXT: entry:
149+
; CHECK-NEXT: [[P11:%.*]] = ptrtoint ptr [[P1]] to i64
150+
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
151+
; CHECK: vector.scevcheck:
152+
; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[P11]] to i2
153+
; CHECK-NEXT: [[TMP1:%.*]] = add i2 [[TMP0]], -1
154+
; CHECK-NEXT: [[TMP2:%.*]] = zext i2 [[TMP1]] to i64
155+
; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP2]], 0
156+
; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
157+
; CHECK: vector.ph:
158+
; CHECK-NEXT: br label [[LOOP:%.*]]
159+
; CHECK: vector.body:
160+
; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[LOOP]] ]
161+
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX2]]
162+
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0
163+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP3]]
164+
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
165+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
166+
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], splat (i8 3)
167+
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX2]], 4
168+
; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true)
169+
; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]])
170+
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
171+
; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]]
172+
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
173+
; CHECK: middle.split:
174+
; CHECK-NEXT: br i1 [[TMP8]], label [[LOOP_END:%.*]], label [[MIDDLE_BLOCK:%.*]]
175+
; CHECK: middle.block:
176+
; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
177+
; CHECK: scalar.ph:
178+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[VECTOR_SCEVCHECK]] ], [ 3, [[ENTRY:%.*]] ]
179+
; CHECK-NEXT: br label [[LOOP1:%.*]]
180+
; CHECK: loop:
181+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
182+
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
183+
; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
184+
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], 3
185+
; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
186+
; CHECK: loop.inc:
187+
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
188+
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
189+
; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]]
190+
; CHECK: loop.end:
191+
; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 1, [[LOOP1]] ], [ 0, [[LOOP_INC]] ], [ 0, [[MIDDLE_BLOCK]] ], [ 1, [[MIDDLE_SPLIT]] ]
192+
; CHECK-NEXT: ret i64 [[RETVAL]]
193+
;
194+
entry:
195+
br label %loop
196+
197+
loop:
198+
%index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
199+
%arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
200+
%ld1 = load i8, ptr %arrayidx, align 1
201+
%cmp3 = icmp eq i8 %ld1, 3
202+
br i1 %cmp3, label %loop.inc, label %loop.end
203+
204+
loop.inc:
205+
%index.next = add i64 %index, 1
206+
%exitcond = icmp ne i64 %index.next, 67
207+
br i1 %exitcond, label %loop, label %loop.end
208+
209+
loop.end:
210+
%retval = phi i64 [ 1, %loop ], [ 0, %loop.inc ]
211+
ret i64 %retval
212+
}
213+
;.
214+
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
215+
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
216+
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
217+
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
218+
;.

0 commit comments

Comments
 (0)