Skip to content

[VPlan] Only use selectVectorizationFactor for cross-check (NFCI). #103033

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Aug 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -354,9 +354,10 @@ class LoopVectorizationPlanner {
: OrigLoop(L), LI(LI), DT(DT), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {}

/// Plan how to best vectorize, return the best VF and its cost, or
/// std::nullopt if vectorization and interleaving should be avoided up front.
std::optional<VectorizationFactor> plan(ElementCount UserVF, unsigned UserIC);
/// Build VPlans for the specified \p UserVF and \p UserIC if they are
/// non-zero or all applicable candidate VFs otherwise. If vectorization and
/// interleaving should be avoided up-front, no plans are generated.
void plan(ElementCount UserVF, unsigned UserIC);

/// Use the VPlan-native path to plan how to best vectorize, return the best
/// VF and its cost.
Expand All @@ -368,7 +369,7 @@ class LoopVectorizationPlanner {

/// Compute and return the most profitable vectorization factor. Also collect
/// all profitable VFs in ProfitableVFs.
ElementCount computeBestVF();
VectorizationFactor computeBestVF();

/// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
/// according to the best selected \p VF and \p UF.
Expand Down Expand Up @@ -450,12 +451,14 @@ class LoopVectorizationPlanner {
VPRecipeBuilder &RecipeBuilder,
ElementCount MinVF);

#ifndef NDEBUG
/// \return The most profitable vectorization factor for the available VPlans
/// and the cost of that VF.
/// This is now only used to verify the decisions by the new VPlan-based
/// cost-model and will be retired once the VPlan-based cost-model is
/// stabilized.
VectorizationFactor selectVectorizationFactor();
#endif

/// Returns true if the per-lane cost of VectorizationFactor A is lower than
/// that of B.
Expand Down
92 changes: 37 additions & 55 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4546,6 +4546,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
return false;
}

#ifndef NDEBUG
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
Expand Down Expand Up @@ -4578,7 +4579,6 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
InstructionCost C = CM.expectedCost(VF);
VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);

#ifndef NDEBUG
unsigned AssumedMinimumVscale =
getVScaleForTuning(OrigLoop, TTI).value_or(1);
unsigned Width =
Expand All @@ -4591,7 +4591,6 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
<< AssumedMinimumVscale << ")");
LLVM_DEBUG(dbgs() << ".\n");
#endif

if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
LLVM_DEBUG(
Expand Down Expand Up @@ -4621,6 +4620,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
return ChosenFactor;
}
#endif

bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
ElementCount VF) const {
Expand Down Expand Up @@ -6985,15 +6985,14 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
return VectorizationFactor::Disabled();
}

std::optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
CM.collectValuesToIgnore();
CM.collectElementTypesForWidening();

FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
return std::nullopt;
return;

// Invalidate interleave groups if all blocks of loop will be predicated.
if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
Expand Down Expand Up @@ -7028,14 +7027,8 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
if (CM.selectUserVectorizationFactor(UserVF)) {
LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
buildVPlansWithVPRecipes(UserVF, UserVF);
if (!hasPlanWithVF(UserVF)) {
LLVM_DEBUG(dbgs()
<< "LV: No VPlan could be built for " << UserVF << ".\n");
return std::nullopt;
}

LLVM_DEBUG(printPlans(dbgs()));
return {{UserVF, 0, 0}};
return;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As @alexey-bataev noted, the `else` should be dropped after the `return` (can be done independently). On the other hand, perhaps it's better to handle the simpler, additional "report UserVF ignored" case first - `if (!CM.selectUserVectorizationFactor(UserVF))` - and otherwise build a VPlan for UserVF and return.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, will adjust that separately.

} else
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just noting another else-after-return, below, but perhaps there it overall looks better(?):

    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {

reportVectorizationInfo("UserVF ignored because of invalid costs.",
"InvalidCost", ORE, OrigLoop);
Expand Down Expand Up @@ -7066,24 +7059,6 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);

LLVM_DEBUG(printPlans(dbgs()));
if (VPlans.empty())
return std::nullopt;
if (all_of(VPlans,
[](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }))
return VectorizationFactor::Disabled();

// Select the optimal vectorization factor according to the legacy cost-model.
// This is now only used to verify the decisions by the new VPlan-based
// cost-model and will be retired once the VPlan-based cost-model is
// stabilized.
VectorizationFactor VF = selectVectorizationFactor();
assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
if (!hasPlanWithVF(VF.Width)) {
LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
<< ".\n");
return std::nullopt;
}
return VF;
}

InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
Expand Down Expand Up @@ -7255,18 +7230,21 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
return Cost;
}

ElementCount LoopVectorizationPlanner::computeBestVF() {
VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
if (VPlans.empty())
return VectorizationFactor::Disabled();
// If there is a single VPlan with a single VF, return it directly.
VPlan &FirstPlan = *VPlans[0];
if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
return *FirstPlan.vectorFactors().begin();
return {*FirstPlan.vectorFactors().begin(), 0, 0};
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(Returning zeroes retains current behavior: the single VPlan case corresponds to a UserVF whose cost and scalar-cost are ignored, although the former does have valid cost (the latter is not calculated).)


ElementCount ScalarVF = ElementCount::getFixed(1);
assert(hasPlanWithVF(ScalarVF) &&
"More than a single plan/VF w/o any plan having scalar VF");

// TODO: Compute scalar cost using VPlan-based cost model.
InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
VectorizationFactor BestFactor = ScalarFactor;

Expand Down Expand Up @@ -7300,7 +7278,20 @@ ElementCount LoopVectorizationPlanner::computeBestVF() {
ProfitableVFs.push_back(CurrentFactor);
}
}
return BestFactor.Width;

#ifndef NDEBUG
// Select the optimal vectorization factor according to the legacy cost-model.
// This is now only used to verify the decisions by the new VPlan-based
// cost-model and will be retired once the VPlan-based cost-model is
// stabilized.
VectorizationFactor LegacyVF = selectVectorizationFactor();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

selectVectorizationFactor() should also be declared and defined under #ifndef NDEBUG.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, thanks

assert(BestFactor.Width == LegacyVF.Width &&
" VPlan cost model and legacy cost model disagreed");
assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
"when vectorizing, the scalar cost must be computed.");
#endif

return BestFactor;
}

static void AddRuntimeUnrollDisableMetaData(Loop *L) {
Expand Down Expand Up @@ -9828,21 +9819,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
ElementCount UserVF = Hints.getWidth();
unsigned UserIC = Hints.getInterleave();

// Plan how to best vectorize, return the best VF and its cost.
std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
// Plan how to best vectorize.
LVP.plan(UserVF, UserIC);
VectorizationFactor VF = LVP.computeBestVF();
unsigned IC = 1;

if (ORE->allowExtraAnalysis(LV_NAME))
LVP.emitInvalidCostRemarks(ORE);

VectorizationFactor VF = VectorizationFactor::Disabled();
unsigned IC = 1;

bool AddBranchWeights =
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
F->getDataLayout(), AddBranchWeights);
if (MaybeVF) {
VF = *MaybeVF;
if (LVP.hasPlanWithVF(VF.Width)) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better to ask instead if VF is valid, i.e., not Disabled?
If VF is invalid, better avoid working with its Width.
If VF is valid, can assert LVP has a plan with it.
Perhaps a cast of VectorizationFactor to bool could facilitate asking if (VF) to check its validity, as in zero representing invalid.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At the moment, VF.Width may be 1 (disabled), but it is still considered valid, if a plan with VF = 1 exists, for interleaving only.

We could update computeBestVF to return Width == 0 if no plans exist, or instead check here whether no plans exist.

Either would probably be done independently?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, right, VectorizationFactor::Disabled (Width == 1) is distinct from "invalid" (can be Width == 0).
Agreed, computeBestVF() should return the latter instead of the former, when there are no plans.
It is somewhat confusing to ask if LVP has a plan with its best VF or not - one would expect that to be asserted - provided this best VF is valid.
Sure, can be done independently.

// Select the interleave count.
IC = CM.selectInterleaveCount(VF.Width, VF.Cost);

Expand Down Expand Up @@ -9882,7 +9871,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
VectorizeLoop = false;
}

if (!MaybeVF && UserIC > 1) {
if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better to ask instead if VF is valid?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Left as is for now, see comment above.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, would be good to simplify, as a follow-up.
Also should be turned into else if? Independently.

// Tell the user interleaving was avoided up-front, despite being explicitly
// requested.
LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
Expand Down Expand Up @@ -9964,11 +9953,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
&CM, BFI, PSI, Checks);

ElementCount BestVF = LVP.computeBestVF();
assert(BestVF.isScalar() &&
"VPlan cost model and legacy cost model disagreed");
VPlan &BestPlan = LVP.getPlanFor(BestVF);
LVP.executePlan(BestVF, IC, BestPlan, Unroller, DT, false);
VPlan &BestPlan = LVP.getPlanFor(VF.Width);
LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(Perhaps a cast of VectorizationFactor to ElementCount, returning its first field, could simplify using the former wherever the latter is required, although seems better to make sure the latter only is expected, by passing .Width explicitly.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Left as explicitly passing .Width for now. The only users of .Cost are selectInterleaveCount, which only uses it to check if the loop is effectively empty (should not need to check the cost for that using VPlan), and areRuntimeChecksProfitable, which may be sunk into computeBestVF.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

selectInterleaveCount does use .Cost "to interleave small loops in order to reduce the loop overhead", but is already set to obtain the cost directly if the one provided is zero - uncomputed (later treating zero cost as "free" ... loop is effectively empty).

areRuntimeChecksProfitable also uses ScalarCost, noting that "The scalar cost should only be 0 when vectorizing with a user specified VF/IC ...", sigh. Anyhow, agree that bailing-out due to unprofitable runtime checks belongs in planning and getting the best VF, rather than later.


ORE->emit([&]() {
return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
Expand All @@ -9979,20 +9965,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
} else {
// If we decided that it is *legal* to vectorize the loop, then do it.

ElementCount BestVF = LVP.computeBestVF();
LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << BestVF << "\n");
assert(VF.Width == BestVF &&
"VPlan cost model and legacy cost model disagreed");
VPlan &BestPlan = LVP.getPlanFor(BestVF);
VPlan &BestPlan = LVP.getPlanFor(VF.Width);
// Consider vectorizing the epilogue too if it's profitable.
VectorizationFactor EpilogueVF =
LVP.selectEpilogueVectorizationFactor(BestVF, IC);
LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
if (EpilogueVF.Width.isVector()) {

// The first pass vectorizes the main loop and creates a scalar epilogue
// to be vectorized by executing the plan (potentially with a different
// factor) again shortly afterwards.
EpilogueLoopVectorizationInfo EPI(BestVF, IC, EpilogueVF.Width, 1);
EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
EPI, &LVL, &CM, BFI, PSI, Checks);

Expand Down Expand Up @@ -10087,10 +10069,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (!MainILV.areSafetyChecksAdded())
DisableRuntimeUnroll = true;
} else {
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, BestVF,
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
PSI, Checks);
LVP.executePlan(BestVF, IC, BestPlan, LB, DT, false);
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
++LoopsVectorized;

// Add metadata to disable runtime unrolling a scalar loop when there
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1695,6 +1695,10 @@ VPlan &LoopVectorizationPlanner::getPlanFor(ElementCount VF) const {

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
if (VPlans.empty()) {
O << "LV: No VPlans built.\n";
return;
}
for (const auto &Plan : VPlans)
if (PrintVPlansInDotFormat)
Plan->printDOT(O);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Not Interleaving.
; CHECK-NEXT: LV: Interleaving is not beneficial.
; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in <stdin>
; CHECK-NEXT: VF picked by VPlan cost model: vscale x 4
; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1
; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF>=1' {
Expand Down Expand Up @@ -336,7 +335,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Not Interleaving.
; CHECK-NEXT: LV: Interleaving is not beneficial.
; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in <stdin>
; CHECK-NEXT: VF picked by VPlan cost model: vscale x 4
; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1
; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF>=1' {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ exit:
; FOR (for.y) should be moved which is not currently supported.
define i32 @test_chained_first_order_recurrences_4(ptr %base) {
; CHECK-LABEL: 'test_chained_first_order_recurrences_4'
; CHECK: No VPlan could be built for
; CHECK: No VPlans built.

entry:
br label %loop
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@ target triple = "x86_64-unknown-linux-gnu"
define void @test() {
; CHECK-LABEL: 'test'
; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
; CHECK: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
; CHECK: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
; CHECK: LV: Found an estimated cost of 3 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
; CHECK: LV: Found an estimated cost of 5 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
; CHECK: LV: Found an estimated cost of 22 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
; CHECK: LV: Found an estimated cost of 92 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4
; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
;
entry:
br label %for.body
Expand Down
Loading