Skip to content

Commit 8b4bffc

Browse files
committed
[VPlan] Model FOR extract of exit value in VPlan.
This patch introduces a new ExtractRecurrenceResult VPInstruction opcode to extract the value of a first-order recurrence (FOR) for users outside the loop (i.e. in the scalar loop's exits). This moves the first part of fixing first-order recurrences to VPlan, and removes some additional code to patch up live-outs, which are now handled automatically. The majority of the test changes are due to changes in the order in which the extracts are now generated. As we are now using VPTransformState to generate the extracts, we may be able to re-use existing extracts in the loop body in some cases. For scalable vectors, in some cases we now have to compute the runtime VF twice, as each extract is now independent, but those redundant computations should be trivial for later passes to clean up (and this is in line with other places in the code that also liberally re-compute runtime VFs).
1 parent ac17fbc commit 8b4bffc

10 files changed

+93
-91
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 0 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -3536,44 +3536,6 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(
35363536
Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
35373537
}
35383538

3539-
auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
3540-
assert(PhiR->getNumUsers() == 1 &&
3541-
RecurSplice->getOpcode() ==
3542-
VPInstruction::FirstOrderRecurrenceSplice &&
3543-
"recurrence phi must have a single user: FirstOrderRecurrenceSplice");
3544-
SmallVector<VPLiveOut *> LiveOuts;
3545-
for (VPUser *U : RecurSplice->users())
3546-
if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
3547-
LiveOuts.push_back(LiveOut);
3548-
3549-
if (!LiveOuts.empty()) {
3550-
// Extract the second last element in the middle block if the
3551-
// Phi is used outside the loop. We need to extract the phi itself
3552-
// and not the last element (the phi update in the current iteration). This
3553-
// will be the value when jumping to the exit block from the
3554-
// LoopMiddleBlock, when the scalar loop is not run at all.
3555-
Value *ExtractForPhiUsedOutsideLoop = nullptr;
3556-
if (VF.isVector()) {
3557-
auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3558-
ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3559-
Incoming, Idx, "vector.recur.extract.for.phi");
3560-
} else {
3561-
assert(UF > 1 && "VF and UF cannot both be 1");
3562-
// When loop is unrolled without vectorizing, initialize
3563-
// ExtractForPhiUsedOutsideLoop with the value just prior to unrolled
3564-
// value of `Incoming`. This is analogous to the vectorized case above:
3565-
// extracting the second last element when VF > 1.
3566-
ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3567-
}
3568-
3569-
for (VPLiveOut *LiveOut : LiveOuts) {
3570-
assert(!Cost->requiresScalarEpilogue(VF.isVector()));
3571-
PHINode *LCSSAPhi = LiveOut->getPhi();
3572-
LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3573-
State.Plan->removeLiveOut(LCSSAPhi);
3574-
}
3575-
}
3576-
35773539
// Fix the initial value of the original recurrence in the scalar loop.
35783540
Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin());
35793541
PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -167,8 +167,8 @@ class VPLane {
167167

168168
static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); }
169169

170-
static VPLane getLastLaneForVF(const ElementCount &VF) {
171-
unsigned LaneOffset = VF.getKnownMinValue() - 1;
170+
static VPLane getLastLaneForVF(const ElementCount &VF, unsigned Offset = 1) {
171+
unsigned LaneOffset = VF.getKnownMinValue() - Offset;
172172
Kind LaneKind;
173173
if (VF.isScalable())
174174
// In this case 'LaneOffset' refers to the offset from the start of the
@@ -1179,6 +1179,7 @@ class VPInstruction : public VPRecipeWithIRFlags {
11791179
BranchOnCount,
11801180
BranchOnCond,
11811181
ComputeReductionResult,
1182+
ExtractRecurrenceResult,
11821183
LogicalAnd, // Non-poison propagating logical And.
11831184
// Add an offset in bytes (second operand) to a base pointer (first
11841185
// operand). Only generates scalar values (either for the first lane only or
@@ -3612,7 +3613,8 @@ inline bool isUniformAfterVectorization(VPValue *VPV) {
36123613
if (auto *GEP = dyn_cast<VPWidenGEPRecipe>(Def))
36133614
return all_of(GEP->operands(), isUniformAfterVectorization);
36143615
if (auto *VPI = dyn_cast<VPInstruction>(Def))
3615-
return VPI->getOpcode() == VPInstruction::ComputeReductionResult;
3616+
return VPI->getOpcode() == VPInstruction::ComputeReductionResult ||
3617+
VPI->getOpcode() == VPInstruction::ExtractRecurrenceResult;
36163618
return false;
36173619
}
36183620
} // end namespace vputils

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
137137
case VPInstruction::Not:
138138
case VPInstruction::CalculateTripCountMinusVF:
139139
case VPInstruction::CanonicalIVIncrementForPart:
140+
case VPInstruction::ExtractRecurrenceResult:
140141
case VPInstruction::LogicalAnd:
141142
case VPInstruction::PtrAdd:
142143
return false;
@@ -300,6 +301,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
300301
case VPInstruction::CalculateTripCountMinusVF:
301302
case VPInstruction::CanonicalIVIncrementForPart:
302303
case VPInstruction::ComputeReductionResult:
304+
case VPInstruction::ExtractRecurrenceResult:
303305
case VPInstruction::PtrAdd:
304306
case VPInstruction::ExplicitVectorLength:
305307
return true;
@@ -558,6 +560,27 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
558560

559561
return ReducedPartRdx;
560562
}
563+
case VPInstruction::ExtractRecurrenceResult: {
564+
if (Part != 0)
565+
return State.get(this, 0, /*IsScalar*/ true);
566+
567+
// Extract the second last element in the middle block for users outside the
568+
// loop.
569+
Value *Res;
570+
if (State.VF.isVector()) {
571+
Res = State.get(
572+
getOperand(0),
573+
VPIteration(State.UF - 1, VPLane::getLastLaneForVF(State.VF, 2)));
574+
} else {
575+
assert(State.UF > 1 && "VF and UF cannot both be 1");
576+
// When loop is unrolled without vectorizing, retrieve the value just
577+
// prior to the final unrolled value. This is analogous to the vectorized
578+
// case above: extracting the second last element when VF > 1.
579+
Res = State.get(getOperand(0), State.UF - 2);
580+
}
581+
Res->setName(Name);
582+
return Res;
583+
}
561584
case VPInstruction::LogicalAnd: {
562585
Value *A = State.get(getOperand(0), Part);
563586
Value *B = State.get(getOperand(1), Part);
@@ -598,6 +621,7 @@ void VPInstruction::execute(VPTransformState &State) {
598621
bool GeneratesPerFirstLaneOnly =
599622
canGenerateScalarForFirstLane() &&
600623
(vputils::onlyFirstLaneUsed(this) ||
624+
getOpcode() == VPInstruction::ExtractRecurrenceResult ||
601625
getOpcode() == VPInstruction::ComputeReductionResult);
602626
bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
603627
for (unsigned Part = 0; Part < State.UF; ++Part) {
@@ -692,6 +716,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
692716
case VPInstruction::BranchOnCount:
693717
O << "branch-on-count";
694718
break;
719+
case VPInstruction::ExtractRecurrenceResult:
720+
O << "extract-recurrence-result";
721+
break;
695722
case VPInstruction::ComputeReductionResult:
696723
O << "compute-reduction-result";
697724
break;

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -812,6 +812,8 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
812812
if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
813813
RecurrencePhis.push_back(FOR);
814814

815+
VPBuilder MiddleBuilder(
816+
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor()));
815817
for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
816818
SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
817819
VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
@@ -843,6 +845,12 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
843845
// Set the first operand of RecurSplice to FOR again, after replacing
844846
// all users.
845847
RecurSplice->setOperand(0, FOR);
848+
849+
auto *Result = cast<VPInstruction>(MiddleBuilder.createNaryOp(
850+
VPInstruction::ExtractRecurrenceResult, {FOR->getBackedgeValue()}, {},
851+
"vector.recur.extract.for.phi"));
852+
RecurSplice->replaceUsesWithIf(
853+
Result, [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); });
846854
}
847855
return true;
848856
}

llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -126,9 +126,9 @@ define i64 @pointer_induction_only(ptr %start, ptr %end) {
126126
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
127127
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
128128
; CHECK: middle.block:
129+
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
129130
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
130131
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
131-
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
132132
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
133133
; CHECK: scalar.ph:
134134
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -191,8 +191,8 @@ define i64 @int_and_pointer_iv(ptr %start, i32 %N) {
191191
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
192192
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
193193
; CHECK: middle.block:
194-
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
195194
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2
195+
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
196196
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
197197
; CHECK: scalar.ph:
198198
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -332,9 +332,9 @@ define i64 @test_ptr_ivs_and_widened_ivs(ptr %src, i32 %N) {
332332
; DEFAULT-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
333333
; DEFAULT-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
334334
; DEFAULT: middle.block:
335+
; DEFAULT-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP15]], i32 2
335336
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
336337
; DEFAULT-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP15]], i32 3
337-
; DEFAULT-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP15]], i32 2
338338
; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
339339
; DEFAULT: scalar.ph:
340340
; DEFAULT-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]

llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -870,9 +870,9 @@ define i8 @add_phifail2(ptr noalias nocapture readonly %p, ptr noalias nocapture
870870
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
871871
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
872872
; CHECK: middle.block:
873+
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <16 x i32> [[TMP6]], i32 14
873874
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
874875
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15
875-
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <16 x i32> [[TMP6]], i32 14
876876
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
877877
; CHECK: scalar.ph:
878878
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]

0 commit comments

Comments
 (0)