Skip to content

Commit 07b3301

Browse files
authored
[VPlan] Model FOR extract of exit value in VPlan. (#93395)
This patch introduces a new ExtractFromEnd VPInstruction opcode to extract the value of a first-order recurrence (FOR) for users outside the loop (i.e. in the scalar loop's exits). This moves the first part of fixing first-order recurrences to VPlan, and removes some additional code to patch up live-outs, which are now handled automatically. The majority of test changes are due to changes in the order in which the extracts are now generated. As we are now using VPTransformState to generate the extracts, we may be able to re-use existing extracts in the loop body in some cases. For scalable vectors, in some cases we now have to compute the runtime VF twice, as each extract is now independent, but these redundant computations should be trivial for later passes to clean up (and are in line with other places in the code that also liberally re-compute runtime VFs). PR: #93395
1 parent 0525c20 commit 07b3301

11 files changed

+130
-100
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 0 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -3536,44 +3536,6 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(
35363536
Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
35373537
}
35383538

3539-
auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
3540-
assert(PhiR->getNumUsers() == 1 &&
3541-
RecurSplice->getOpcode() ==
3542-
VPInstruction::FirstOrderRecurrenceSplice &&
3543-
"recurrence phi must have a single user: FirstOrderRecurrenceSplice");
3544-
SmallVector<VPLiveOut *> LiveOuts;
3545-
for (VPUser *U : RecurSplice->users())
3546-
if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
3547-
LiveOuts.push_back(LiveOut);
3548-
3549-
if (!LiveOuts.empty()) {
3550-
// Extract the second last element in the middle block if the
3551-
// Phi is used outside the loop. We need to extract the phi itself
3552-
// and not the last element (the phi update in the current iteration). This
3553-
// will be the value when jumping to the exit block from the
3554-
// LoopMiddleBlock, when the scalar loop is not run at all.
3555-
Value *ExtractForPhiUsedOutsideLoop = nullptr;
3556-
if (VF.isVector()) {
3557-
auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3558-
ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3559-
Incoming, Idx, "vector.recur.extract.for.phi");
3560-
} else {
3561-
assert(UF > 1 && "VF and UF cannot both be 1");
3562-
// When loop is unrolled without vectorizing, initialize
3563-
// ExtractForPhiUsedOutsideLoop with the value just prior to unrolled
3564-
// value of `Incoming`. This is analogous to the vectorized case above:
3565-
// extracting the second last element when VF > 1.
3566-
ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3567-
}
3568-
3569-
for (VPLiveOut *LiveOut : LiveOuts) {
3570-
assert(!Cost->requiresScalarEpilogue(VF.isVector()));
3571-
PHINode *LCSSAPhi = LiveOut->getPhi();
3572-
LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3573-
State.Plan->removeLiveOut(LCSSAPhi);
3574-
}
3575-
}
3576-
35773539
// Fix the initial value of the original recurrence in the scalar loop.
35783540
Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin());
35793541
PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -167,8 +167,10 @@ class VPLane {
167167

168168
static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); }
169169

170-
static VPLane getLastLaneForVF(const ElementCount &VF) {
171-
unsigned LaneOffset = VF.getKnownMinValue() - 1;
170+
static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset) {
171+
assert(Offset > 0 && Offset <= VF.getKnownMinValue() &&
172+
"trying to extract with invalid offset");
173+
unsigned LaneOffset = VF.getKnownMinValue() - Offset;
172174
Kind LaneKind;
173175
if (VF.isScalable())
174176
// In this case 'LaneOffset' refers to the offset from the start of the
@@ -179,6 +181,10 @@ class VPLane {
179181
return VPLane(LaneOffset, LaneKind);
180182
}
181183

184+
static VPLane getLastLaneForVF(const ElementCount &VF) {
185+
return getLaneFromEnd(VF, 1);
186+
}
187+
182188
/// Returns a compile-time known value for the lane index and asserts if the
183189
/// lane can only be calculated at runtime.
184190
unsigned getKnownLane() const {
@@ -1182,6 +1188,12 @@ class VPInstruction : public VPRecipeWithIRFlags {
11821188
BranchOnCount,
11831189
BranchOnCond,
11841190
ComputeReductionResult,
1191+
// Takes the VPValue to extract from as first operand and the lane or part
1192+
// to extract as second operand, counting from the end starting with 1 for
1193+
// last. The second operand must be a positive constant and <= VF when
1194+
// extracting from a vector or <= UF when extracting from an unrolled
1195+
// scalar.
1196+
ExtractFromEnd,
11851197
LogicalAnd, // Non-poison propagating logical And.
11861198
// Add an offset in bytes (second operand) to a base pointer (first
11871199
// operand). Only generates scalar values (either for the first lane only or
@@ -1327,6 +1339,10 @@ class VPInstruction : public VPRecipeWithIRFlags {
13271339
};
13281340
llvm_unreachable("switch should return");
13291341
}
1342+
1343+
/// Returns true if this VPInstruction produces a scalar value from a vector,
1344+
/// e.g. by performing a reduction or extracting a lane.
1345+
bool isVectorToScalar() const;
13301346
};
13311347

13321348
/// VPWidenRecipe is a recipe for producing a copy of vector type its
@@ -3657,7 +3673,7 @@ inline bool isUniformAfterVectorization(VPValue *VPV) {
36573673
if (auto *GEP = dyn_cast<VPWidenGEPRecipe>(Def))
36583674
return all_of(GEP->operands(), isUniformAfterVectorization);
36593675
if (auto *VPI = dyn_cast<VPInstruction>(Def))
3660-
return VPI->getOpcode() == VPInstruction::ComputeReductionResult;
3676+
return VPI->isVectorToScalar();
36613677
return false;
36623678
}
36633679
} // end namespace vputils

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,12 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
4545
CachedTypes[OtherV] = ResTy;
4646
return ResTy;
4747
}
48+
case VPInstruction::ExtractFromEnd: {
49+
Type *BaseTy = inferScalarType(R->getOperand(0));
50+
if (auto *VecTy = dyn_cast<VectorType>(BaseTy))
51+
return VecTy->getElementType();
52+
return BaseTy;
53+
}
4854
case VPInstruction::Not: {
4955
Type *ResTy = inferScalarType(R->getOperand(0));
5056
assert(IntegerType::get(Ctx, 1) == ResTy &&

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
137137
case VPInstruction::Not:
138138
case VPInstruction::CalculateTripCountMinusVF:
139139
case VPInstruction::CanonicalIVIncrementForPart:
140+
case VPInstruction::ExtractFromEnd:
140141
case VPInstruction::LogicalAnd:
141142
case VPInstruction::PtrAdd:
142143
return false;
@@ -293,13 +294,13 @@ bool VPInstruction::doesGeneratePerAllLanes() const {
293294
bool VPInstruction::canGenerateScalarForFirstLane() const {
294295
if (Instruction::isBinaryOp(getOpcode()))
295296
return true;
296-
297+
if (isVectorToScalar())
298+
return true;
297299
switch (Opcode) {
298300
case VPInstruction::BranchOnCond:
299301
case VPInstruction::BranchOnCount:
300302
case VPInstruction::CalculateTripCountMinusVF:
301303
case VPInstruction::CanonicalIVIncrementForPart:
302-
case VPInstruction::ComputeReductionResult:
303304
case VPInstruction::PtrAdd:
304305
case VPInstruction::ExplicitVectorLength:
305306
return true;
@@ -558,6 +559,29 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
558559

559560
return ReducedPartRdx;
560561
}
562+
case VPInstruction::ExtractFromEnd: {
563+
if (Part != 0)
564+
return State.get(this, 0, /*IsScalar*/ true);
565+
566+
auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
567+
unsigned Offset = CI->getZExtValue();
568+
assert(Offset > 0 && "Offset from end must be positive");
569+
Value *Res;
570+
if (State.VF.isVector()) {
571+
assert(Offset <= State.VF.getKnownMinValue() &&
572+
"invalid offset to extract from");
573+
// Extract lane VF - Offset from the operand.
574+
Res = State.get(
575+
getOperand(0),
576+
VPIteration(State.UF - 1, VPLane::getLaneFromEnd(State.VF, Offset)));
577+
} else {
578+
assert(Offset <= State.UF && "invalid offset to extract from");
579+
// When loop is unrolled without vectorizing, retrieve UF - Offset.
580+
Res = State.get(getOperand(0), State.UF - Offset);
581+
}
582+
Res->setName(Name);
583+
return Res;
584+
}
561585
case VPInstruction::LogicalAnd: {
562586
Value *A = State.get(getOperand(0), Part);
563587
Value *B = State.get(getOperand(1), Part);
@@ -575,6 +599,11 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
575599
}
576600
}
577601

602+
bool VPInstruction::isVectorToScalar() const {
603+
return getOpcode() == VPInstruction::ExtractFromEnd ||
604+
getOpcode() == VPInstruction::ComputeReductionResult;
605+
}
606+
578607
#if !defined(NDEBUG)
579608
bool VPInstruction::isFPMathOp() const {
580609
// Inspired by FPMathOperator::classof. Notable differences are that we don't
@@ -597,8 +626,7 @@ void VPInstruction::execute(VPTransformState &State) {
597626
State.setDebugLocFrom(getDebugLoc());
598627
bool GeneratesPerFirstLaneOnly =
599628
canGenerateScalarForFirstLane() &&
600-
(vputils::onlyFirstLaneUsed(this) ||
601-
getOpcode() == VPInstruction::ComputeReductionResult);
629+
(vputils::onlyFirstLaneUsed(this) || isVectorToScalar());
602630
bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
603631
for (unsigned Part = 0; Part < State.UF; ++Part) {
604632
if (GeneratesPerAllLanes) {
@@ -692,6 +720,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
692720
case VPInstruction::BranchOnCount:
693721
O << "branch-on-count";
694722
break;
723+
case VPInstruction::ExtractFromEnd:
724+
O << "extract-from-end";
725+
break;
695726
case VPInstruction::ComputeReductionResult:
696727
O << "compute-reduction-result";
697728
break;

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -802,7 +802,7 @@ sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
802802
}
803803

804804
bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
805-
VPBuilder &Builder) {
805+
VPBuilder &LoopBuilder) {
806806
VPDominatorTree VPDT;
807807
VPDT.recalculate(Plan);
808808

@@ -812,6 +812,8 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
812812
if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
813813
RecurrencePhis.push_back(FOR);
814814

815+
VPBuilder MiddleBuilder(
816+
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor()));
815817
for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
816818
SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
817819
VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
@@ -831,18 +833,28 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
831833
// fixed-order recurrence.
832834
VPBasicBlock *InsertBlock = Previous->getParent();
833835
if (isa<VPHeaderPHIRecipe>(Previous))
834-
Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
836+
LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
835837
else
836-
Builder.setInsertPoint(InsertBlock, std::next(Previous->getIterator()));
838+
LoopBuilder.setInsertPoint(InsertBlock,
839+
std::next(Previous->getIterator()));
837840

838841
auto *RecurSplice = cast<VPInstruction>(
839-
Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
840-
{FOR, FOR->getBackedgeValue()}));
842+
LoopBuilder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
843+
{FOR, FOR->getBackedgeValue()}));
841844

842845
FOR->replaceAllUsesWith(RecurSplice);
843846
// Set the first operand of RecurSplice to FOR again, after replacing
844847
// all users.
845848
RecurSplice->setOperand(0, FOR);
849+
850+
Type *IntTy = Plan.getCanonicalIV()->getScalarType();
851+
auto *Result = cast<VPInstruction>(MiddleBuilder.createNaryOp(
852+
VPInstruction::ExtractFromEnd,
853+
{FOR->getBackedgeValue(),
854+
Plan.getOrAddLiveIn(ConstantInt::get(IntTy, 2))},
855+
{}, "vector.recur.extract.for.phi"));
856+
RecurSplice->replaceUsesWithIf(
857+
Result, [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); });
846858
}
847859
return true;
848860
}

llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -126,9 +126,9 @@ define i64 @pointer_induction_only(ptr %start, ptr %end) {
126126
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
127127
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
128128
; CHECK: middle.block:
129+
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
129130
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
130131
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
131-
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
132132
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
133133
; CHECK: scalar.ph:
134134
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -191,8 +191,8 @@ define i64 @int_and_pointer_iv(ptr %start, i32 %N) {
191191
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
192192
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
193193
; CHECK: middle.block:
194-
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
195194
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2
195+
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
196196
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
197197
; CHECK: scalar.ph:
198198
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -332,9 +332,9 @@ define i64 @test_ptr_ivs_and_widened_ivs(ptr %src, i32 %N) {
332332
; DEFAULT-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
333333
; DEFAULT-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
334334
; DEFAULT: middle.block:
335+
; DEFAULT-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP15]], i32 2
335336
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
336337
; DEFAULT-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP15]], i32 3
337-
; DEFAULT-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP15]], i32 2
338338
; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
339339
; DEFAULT: scalar.ph:
340340
; DEFAULT-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]

llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -870,9 +870,9 @@ define i8 @add_phifail2(ptr noalias nocapture readonly %p, ptr noalias nocapture
870870
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
871871
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
872872
; CHECK: middle.block:
873+
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <16 x i32> [[TMP6]], i32 14
873874
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
874875
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15
875-
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <16 x i32> [[TMP6]], i32 14
876876
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
877877
; CHECK: scalar.ph:
878878
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]

0 commit comments

Comments
 (0)