Skip to content

[VPlan] Update final IV exit value via VPlan. #112147

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Jan 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
214 changes: 21 additions & 193 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -543,11 +543,6 @@ class InnerLoopVectorizer {
protected:
friend class LoopVectorizationPlanner;

/// Set up the values of the IVs correctly when exiting the vector loop.
virtual void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
Value *VectorTripCount, BasicBlock *MiddleBlock,
VPTransformState &State);

/// Iteratively sink the scalarized operands of a predicated instruction into
/// the block that was created for it.
void sinkScalarOperands(Instruction *PredInst);
Expand Down Expand Up @@ -785,10 +780,6 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
void printDebugTracesAtStart() override;
void printDebugTracesAtEnd() override;

void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
Value *VectorTripCount, BasicBlock *MiddleBlock,
VPTransformState &State) override {};
};

// A specialized derived class of inner loop vectorizer that performs
Expand Down Expand Up @@ -2782,97 +2773,6 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
return LoopVectorPreHeader;
}

// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
const InductionDescriptor &II,
Value *VectorTripCount,
BasicBlock *MiddleBlock,
VPTransformState &State) {
// There are two kinds of external IV usages - those that use the value
// computed in the last iteration (the PHI) and those that use the penultimate
// value (the value that feeds into the phi from the loop latch).
// We allow both, but they, obviously, have different values.

DenseMap<Value *, Value *> MissingVals;

Value *EndValue = cast<PHINode>(OrigPhi->getIncomingValueForBlock(
OrigLoop->getLoopPreheader()))
->getIncomingValueForBlock(MiddleBlock);

// An external user of the last iteration's value should see the value that
// the remainder loop uses to initialize its own IV.
Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
for (User *U : PostInc->users()) {
Instruction *UI = cast<Instruction>(U);
if (!OrigLoop->contains(UI)) {
assert(isa<PHINode>(UI) && "Expected LCSSA form");
MissingVals[UI] = EndValue;
}
}

// An external user of the penultimate value need to see EndValue - Step.
// The simplest way to get this is to recompute it from the constituent SCEVs,
// that is Start + (Step * (CRD - 1)).
Comment on lines -2817 to -2818
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This part is obsolete.

for (User *U : OrigPhi->users()) {
auto *UI = cast<Instruction>(U);
if (!OrigLoop->contains(UI)) {
assert(isa<PHINode>(UI) && "Expected LCSSA form");
IRBuilder<> B(MiddleBlock->getTerminator());

// Fast-math-flags propagate from the original induction instruction.
if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp()))
B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
assert(StepVPV && "step must have been expanded during VPlan execution");
Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
: State.get(StepVPV, VPLane(0));
Value *Escape = nullptr;
if (EndValue->getType()->isIntegerTy())
Escape = B.CreateSub(EndValue, Step);
else if (EndValue->getType()->isPointerTy())
Escape = B.CreatePtrAdd(EndValue, B.CreateNeg(Step));
else {
assert(EndValue->getType()->isFloatingPointTy() &&
"Unexpected induction type");
Escape = B.CreateBinOp(II.getInductionBinOp()->getOpcode() ==
Instruction::FAdd
? Instruction::FSub
: Instruction::FAdd,
EndValue, Step);
}
Escape->setName("ind.escape");
MissingVals[UI] = Escape;
}
}

assert((MissingVals.empty() ||
all_of(MissingVals,
[MiddleBlock, this](const std::pair<Value *, Value *> &P) {
return all_of(
predecessors(cast<Instruction>(P.first)->getParent()),
[MiddleBlock, this](BasicBlock *Pred) {
return Pred == MiddleBlock ||
Pred == OrigLoop->getLoopLatch();
});
})) &&
"Expected escaping values from latch/middle.block only");

for (auto &I : MissingVals) {
PHINode *PHI = cast<PHINode>(I.first);
// One corner case we have to handle is two IVs "chasing" each-other,
// that is %IV2 = phi [...], [ %IV1, %latch ]
// In this case, if IV1 has an external use, we need to avoid adding both
// "last value of IV1" and "penultimate value of IV2". So, verify that we
// don't already have an incoming value for the middle block.
if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
PHI->addIncoming(I.second, MiddleBlock);
}
}

namespace {

struct CSEDenseMapInfo {
Expand Down Expand Up @@ -2999,24 +2899,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
for (PHINode &PN : Exit->phis())
PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);

if (Cost->requiresScalarEpilogue(VF.isVector())) {
// No edge from the middle block to the unique exit block has been inserted
// and there is nothing to fix from vector loop; phis should have incoming
// from scalar loop only.
} else {
// TODO: Check in VPlan to see if IV users need fixing instead of checking
// the cost model.

// If we inserted an edge from the middle block to the unique exit block,
// update uses outside the loop (phis) to account for the newly inserted
// edge.

// Fix-up external users of the induction variables.
for (const auto &Entry : Legal->getInductionVars())
fixupIVUsers(Entry.first, Entry.second,
getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State);
}

// Don't apply optimizations below when no vector region remains, as they all
// require a vector loop at the moment.
if (!State.Plan->getVectorLoopRegion())
Expand Down Expand Up @@ -9049,11 +8931,9 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
/// the end value of the induction.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also explain recording \p EndValues.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated, thanks!

static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV,
VPBuilder &VectorPHBuilder,
VPBuilder &ScalarPHBuilder,
VPTypeAnalysis &TypeInfo,
VPValue *VectorTC) {
static VPInstruction *addResumePhiRecipeForInduction(
VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
// Truncated wide inductions resume from the last lane of their vector value
// in the last vector iteration which is handled elsewhere.
Comment on lines 8938 to 8939
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Regarding this being "handled elsewhere", should all exiting IV values be handled first by extracting the last lane, whether resumed by the scalar loop or used in the exit block? Then, as a separate (conceptually optional) subsequent optimization, replace such extractions by precomputing the end values, for both scalar loop and exit block users.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good, this effectively means updating all scalar VPIRINstructions wrapping phis together, using the same logic. Might be good to first land this patch and then consolidate the phi handling, to reduce the diff?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reducing diff here and following up is fine. Worth leaving behind a TODO somewhere.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added the TODO to to place where we call addResumePhiRecipeForInduction

Expand Down Expand Up @@ -9087,8 +8967,10 @@ static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV,

/// Create resume phis in the scalar preheader for first-order recurrences,
/// reductions and inductions, and update the VPIRInstructions wrapping the
/// original phis in the scalar header.
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
/// original phis in the scalar header. End values for inductions are added to
/// \p IVEndValues.
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
DenseMap<VPValue *, VPValue *> &IVEndValues) {
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
auto *ScalarPH = Plan.getScalarPreheader();
auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
Expand All @@ -9105,11 +8987,16 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
if (!ScalarPhiI)
break;

// TODO: Extract final value from induction recipe initially, optimize to
// pre-computed end value together in optimizeInductionExitUsers.
auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI));
if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
if (VPValue *ResumePhi = addResumePhiRecipeForInduction(
if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
&Plan.getVectorTripCount())) {
assert(ResumePhi->getOpcode() == VPInstruction::ResumePhi &&
"Expected a ResumePhi");
IVEndValues[WideIVR] = ResumePhi->getOperand(0);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: good to assert we know what we're taking the first operand of

Suggested change
IVEndValues[WideIVR] = ResumePhi->getOperand(0);
assert(ResumePhi->getOpcode == VPInstruction::ResumePhi && "Expected a ResumePhi");
IVEndValues[WideIVR] = ResumePhi->getOperand(0);

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done thanks

ScalarPhiIRI->addOperand(ResumePhi);
continue;
}
Expand Down Expand Up @@ -9140,65 +9027,6 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
}
}

/// Return true if \p VPV is an optimizable IV or IV use. That is, if \p VPV is
/// either an untruncated wide induction, or if it increments a wide induction
/// by its step.
static bool isOptimizableIVOrUse(VPValue *VPV) {
VPRecipeBase *Def = VPV->getDefiningRecipe();
if (!Def)
return false;
auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Def);
if (WideIV) {
// VPV itself is a wide induction, separately compute the end value for exit
// users if it is not a truncated IV.
return isa<VPWidenPointerInductionRecipe>(WideIV) ||
!cast<VPWidenIntOrFpInductionRecipe>(WideIV)->getTruncInst();
}

// Check if VPV is an optimizable induction increment.
if (Def->getNumOperands() != 2)
return false;
WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
if (!WideIV)
WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
if (!WideIV)
return false;

using namespace VPlanPatternMatch;
auto &ID = WideIV->getInductionDescriptor();

// Check if VPV increments the induction by the induction step.
VPValue *IVStep = WideIV->getStepValue();
switch (ID.getInductionOpcode()) {
case Instruction::Add:
return match(VPV, m_c_Binary<Instruction::Add>(m_Specific(WideIV),
m_Specific(IVStep)));
case Instruction::FAdd:
return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
m_Specific(IVStep)));
case Instruction::FSub:
return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
m_Specific(IVStep)));
case Instruction::Sub: {
// IVStep will be the negated step of the subtraction. Check if Step == -1 *
// IVStep.
VPValue *Step;
if (!match(VPV, m_Binary<Instruction::Sub>(m_VPValue(), m_VPValue(Step))) ||
!Step->isLiveIn() || !IVStep->isLiveIn())
return false;
auto *StepCI = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
auto *IVStepCI = dyn_cast<ConstantInt>(IVStep->getLiveInIRValue());
return StepCI && IVStepCI &&
StepCI->getValue() == (-1 * IVStepCI->getValue());
}
default:
return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
match(VPV, m_GetElementPtr(m_Specific(WideIV),
m_Specific(WideIV->getStepValue())));
}
llvm_unreachable("should have been covered by switch above");
}

// Collect VPIRInstructions for phis in the exit blocks that are modeled
// in VPlan and add the exiting VPValue as operand. Some exiting values are not
// modeled explicitly yet and won't be included. Those are un-truncated
Expand Down Expand Up @@ -9228,12 +9056,6 @@ collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder,
}
Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
// Exit values for inductions are computed and updated outside of VPlan
// and independent of induction recipes.
// TODO: Compute induction exit values in VPlan.
if (isOptimizableIVOrUse(V) &&
ExitVPBB->getSinglePredecessor() == MiddleVPBB)
continue;
ExitUsersToFix.insert(ExitIRI);
ExitIRI->addOperand(V);
}
Expand All @@ -9253,6 +9075,7 @@ addUsersInExitBlocks(VPlan &Plan,

auto *MiddleVPBB = Plan.getMiddleBlock();
VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());

// Introduce extract for exiting values and update the VPIRInstructions
// modeling the corresponding LCSSA phis.
Expand Down Expand Up @@ -9574,7 +9397,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
VPlanTransforms::handleUncountableEarlyExit(
*Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder);
}
addScalarResumePhis(RecipeBuilder, *Plan);
DenseMap<VPValue *, VPValue *> IVEndValues;
addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Exiting IV values can be optimized for both out-of-loop users, both those reaching the scalar loop and those in the exit block. Rather than taking care of the former here and of the latter some 80+ LOC below, passing an IVEndValues map of the end-values created here to be reused there, it would be better to optimize both out-of-loop users together.

Scalar loop resume values use the post-increment IV, i.e., the "end value" of the IV, whereas exit block users may use the post or pre increment IV. However, both types of exit block users also use the "end value" of the IV - the latter with an additional step taken backwards.

So how about doing the following: start with "legalizing" all exiting IV users to extract the last lane, as part of initial VPlan construction. Then, as part of optimizing VPlan, canonicalize exit-block users who use the pre-increment IV, to use the post-increment IV instead, followed with taking the additional step back (at the exit block). Finally, replace every extraction from a post-increment IV feeding out-of-loop users, with a precomputed end-value. WDYT, worth doing so, now or as follow-up?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good, as per comment above would probably be good to do as follow-up, as it would probably further increase the diff if pulled in here?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok. Worth leaving behind a TODO somewhere.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added where addResumePhiRecipeForInduction is called, thanks

SetVector<VPIRInstruction *> ExitUsersToFix =
collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan);
addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
Expand Down Expand Up @@ -9657,6 +9481,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
WithoutRuntimeCheck);
}
VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues);

assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
Expand Down Expand Up @@ -9708,7 +9533,10 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
}
addScalarResumePhis(RecipeBuilder, *Plan);
DenseMap<VPValue *, VPValue *> IVEndValues;
// TODO: IVEndValues are not used yet in the native path, to optimize exit
// values.
addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);

assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -1422,6 +1422,12 @@ class VPIRInstruction : public VPRecipeBase {
"Op must be an operand of the recipe");
return true;
}

bool onlyFirstLaneUsed(const VPValue *Op) const override {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit:

Suggested change
bool onlyFirstLaneUsed(const VPValue *Op) const override {
bool onlyFirstLaneUsed(const VPValue *Op) const override {

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done thanks!

assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
}
Comment on lines +1426 to +1430
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fix can be committed independently, but tested only now?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep

};

/// VPWidenRecipe is a recipe for producing a widened instruction using the
Expand Down
Loading
Loading