Skip to content

Commit d038455

Browse files
committed
[VPlan] Implement interleaving as VPlan-to-VPlan transform.
This patch implements explicit interleaving as a VPlan transform. In follow-up patches this will allow simplifying VPTransformState (no need to store unrolled parts) as well as recipe execution (no need to generate code for multiple parts in each recipe). It also allows for more general optimizations (e.g. avoid generating code for recipes that are uniform across parts). In the initial implementation, a number of recipes still take the unrolled part as an additional, optional argument if their execution depends on the unrolled part. The computation for start/step values for scalable inductions changed slightly. Previously the step would be computed as a scalar and then splatted; now vscale gets splatted and multiplied by the step in a vector mul. This has been split off from llvm#94339, which also includes changes to simplify VPTransformState and recipes' ::execute. The current version mostly leaves existing ::execute untouched and instead sets VPTransformState::UF to 1.
1 parent db0603c commit d038455

27 files changed

+917
-428
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,15 @@ class VPBuilder {
161161
return tryInsertInstruction(
162162
new VPInstruction(Opcode, Operands, WrapFlags, DL, Name));
163163
}
164+
165+
VPInstruction *createFPOp(unsigned Opcode,
166+
std::initializer_list<VPValue *> Operands,
167+
DebugLoc DL = {}, const Twine &Name = "",
168+
FastMathFlags FMFs = {}) {
169+
auto *Op = new VPInstruction(Opcode, Operands, FMFs, DL, Name);
170+
return tryInsertInstruction(Op);
171+
}
172+
164173
VPValue *createNot(VPValue *Operand, DebugLoc DL = {},
165174
const Twine &Name = "") {
166175
return createInstruction(VPInstruction::Not, {Operand}, DL, Name);

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7384,6 +7384,8 @@ LoopVectorizationPlanner::executePlan(
73847384
"expanded SCEVs to reuse can only be used during epilogue vectorization");
73857385
(void)IsEpilogueVectorization;
73867386

7387+
VPlanTransforms::interleave(BestVPlan, BestUF,
7388+
OrigLoop->getHeader()->getModule()->getContext());
73877389
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
73887390

73897391
LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
@@ -9220,6 +9222,87 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
92209222
VPlanTransforms::clearReductionWrapFlags(*Plan);
92219223
}
92229224

9225+
void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9226+
assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9227+
"Not a pointer induction according to InductionDescriptor!");
9228+
assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9229+
"Unexpected type.");
9230+
assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
9231+
"Recipe should have been replaced");
9232+
9233+
auto *IVR = getParent()->getPlan()->getCanonicalIV();
9234+
PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
9235+
unsigned CurrentPart = 0;
9236+
if (getNumOperands() == 5)
9237+
CurrentPart =
9238+
cast<ConstantInt>(getOperand(4)->getLiveInIRValue())->getZExtValue();
9239+
Type *PhiType = IndDesc.getStep()->getType();
9240+
9241+
// Build a pointer phi
9242+
Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9243+
Type *ScStValueType = ScalarStartValue->getType();
9244+
PHINode *NewPointerPhi = nullptr;
9245+
9246+
BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9247+
if (getNumOperands() == 5) {
9248+
auto *GEP = cast<GetElementPtrInst>(State.get(getOperand(3), 0));
9249+
NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
9250+
} else {
9251+
NewPointerPhi =
9252+
PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
9253+
NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9254+
}
9255+
9256+
// A pointer induction, performed by using a gep
9257+
BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
9258+
unsigned UF = getNumOperands() == 2
9259+
? 1
9260+
: cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
9261+
->getZExtValue();
9262+
9263+
Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
9264+
Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9265+
Value *NumUnrolledElems =
9266+
State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF));
9267+
// Add induction update using an incorrect block temporarily. The phi node
9268+
// will be fixed after VPlan execution. Note that at this point the latch
9269+
// block cannot be used, as it does not exist yet.
9270+
// TODO: Model increment value in VPlan, by turning the recipe into a
9271+
// multi-def and a subclass of VPHeaderPHIRecipe.
9272+
if (getNumOperands() != 5) {
9273+
Value *InductionGEP = GetElementPtrInst::Create(
9274+
State.Builder.getInt8Ty(), NewPointerPhi,
9275+
State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9276+
InductionLoc);
9277+
9278+
NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9279+
}
9280+
9281+
// Create UF many actual address geps that use the pointer
9282+
// phi as base and a vectorized version of the step value
9283+
// (<step*0, ..., step*N>) as offset.
9284+
for (unsigned Part = 0; Part < State.UF; ++Part) {
9285+
Type *VecPhiType = VectorType::get(PhiType, State.VF);
9286+
Value *StartOffsetScalar = State.Builder.CreateMul(
9287+
RuntimeVF, ConstantInt::get(PhiType, CurrentPart));
9288+
Value *StartOffset =
9289+
State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9290+
// Create a vector of consecutive numbers from zero to VF.
9291+
StartOffset = State.Builder.CreateAdd(
9292+
StartOffset, State.Builder.CreateStepVector(VecPhiType));
9293+
9294+
assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
9295+
"scalar step must be the same across all parts");
9296+
Value *GEP = State.Builder.CreateGEP(
9297+
State.Builder.getInt8Ty(), NewPointerPhi,
9298+
State.Builder.CreateMul(
9299+
StartOffset,
9300+
State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9301+
"vector.gep"));
9302+
State.set(this, GEP, Part);
9303+
}
9304+
}
9305+
92239306
void VPDerivedIVRecipe::execute(VPTransformState &State) {
92249307
assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
92259308

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -931,6 +931,10 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
931931
// FIXME: Model VF * UF computation completely in VPlan.
932932
VFxUF.setUnderlyingValue(
933933
createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF));
934+
if (VF.getNumUsers() > 0) {
935+
VF.setUnderlyingValue(
936+
createStepForVF(Builder, TripCountV->getType(), State.VF, 1));
937+
}
934938

935939
// When vectorizing the epilogue loop, the canonical induction start value
936940
// needs to be changed from zero to the value after the main vector loop.
@@ -974,6 +978,7 @@ static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
974978
/// Assumes a single pre-header basic-block was created for this. Introduce
975979
/// additional basic-blocks as needed, and fill them all.
976980
void VPlan::execute(VPTransformState *State) {
981+
State->UF = 1;
977982
// Initialize CFG state.
978983
State->CFG.PrevVPBB = nullptr;
979984
State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
@@ -1048,6 +1053,9 @@ void VPlan::execute(VPTransformState *State) {
10481053
// Move the last step to the end of the latch block. This ensures
10491054
// consistent placement of all induction updates.
10501055
Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
1056+
if (isa<VPWidenIntOrFpInductionRecipe>(&R) && R.getNumOperands() == 4)
1057+
Inc->setOperand(0, State->get(R.getOperand(3), 0));
1058+
10511059
Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
10521060
continue;
10531061
}
@@ -1418,6 +1426,10 @@ void VPlanIngredient::print(raw_ostream &O) const {
14181426

14191427
template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);
14201428

1429+
bool VPValue::isDefinedOutsideVectorRegions() const {
1430+
return !hasDefiningRecipe() || !getDefiningRecipe()->getParent()->getParent();
1431+
}
1432+
14211433
void VPValue::replaceAllUsesWith(VPValue *New) {
14221434
replaceUsesWithIf(New, [](VPUser &, unsigned) { return true; });
14231435
}

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -727,6 +727,8 @@ class VPLiveOut : public VPUser {
727727

728728
PHINode *getPhi() const { return Phi; }
729729

730+
bool onlyFirstPartUsed(const VPValue *Op) const override { return true; }
731+
730732
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
731733
/// Print the VPLiveOut to \p O.
732734
void print(raw_ostream &O, VPSlotTracker &SlotTracker) const;
@@ -1397,6 +1399,9 @@ class VPInstruction : public VPRecipeWithIRFlags {
13971399
/// Returns true if this VPInstruction's operands are single scalars and the
13981400
/// result is also a single scalar.
13991401
bool isSingleScalar() const;
1402+
1403+
/// Return the interleave count from the VPInstruction's last argument.
1404+
unsigned getInterleaveCount() const;
14001405
};
14011406

14021407
/// VPWidenRecipe is a recipe for producing a widened instruction using the
@@ -1686,6 +1691,9 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
16861691
isInBounds(), getDebugLoc());
16871692
}
16881693

1694+
/// Return the current part for this vector pointer.
1695+
unsigned getPartForRecipe() const;
1696+
16891697
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
16901698
/// Print the recipe.
16911699
void print(raw_ostream &O, const Twine &Indent,
@@ -2026,6 +2034,9 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
20262034

20272035
/// Returns true, if the phi is part of an in-loop reduction.
20282036
bool isInLoop() const { return IsInLoop; }
2037+
2038+
/// Return the current part for this reduction phi.
2039+
unsigned getPartForRecipe() const;
20292040
};
20302041

20312042
/// A recipe for vectorizing a phi-node as a sequence of mask-based select
@@ -2736,6 +2747,9 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
27362747
/// Generate the canonical scalar induction phi of the vector loop.
27372748
void execute(VPTransformState &State) override;
27382749

2750+
/// Return the current part for this canonical IV phi.
2751+
unsigned getPartForRecipe() const;
2752+
27392753
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
27402754
/// Print the recipe.
27412755
void print(raw_ostream &O, const Twine &Indent,
@@ -2780,7 +2794,9 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
27802794
~VPActiveLaneMaskPHIRecipe() override = default;
27812795

27822796
VPActiveLaneMaskPHIRecipe *clone() override {
2783-
return new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc());
2797+
auto *R = new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc());
2798+
R->addOperand(getOperand(1));
2799+
return R;
27842800
}
27852801

27862802
VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC)
@@ -2858,6 +2874,9 @@ class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe {
28582874
/// step = <VF*UF, VF*UF, ..., VF*UF>.
28592875
void execute(VPTransformState &State) override;
28602876

2877+
/// Return the current part for this widened canonical IV.
2878+
unsigned getPartForRecipe() const;
2879+
28612880
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
28622881
/// Print the recipe.
28632882
void print(raw_ostream &O, const Twine &Indent,
@@ -2970,6 +2989,9 @@ class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags {
29702989
"Op must be an operand of the recipe");
29712990
return true;
29722991
}
2992+
2993+
/// Return the current part for this scalar step.
2994+
unsigned getPartForRecipe() const;
29732995
};
29742996

29752997
/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
@@ -3294,6 +3316,8 @@ class VPlan {
32943316
/// Represents the loop-invariant VF * UF of the vector loop region.
32953317
VPValue VFxUF;
32963318

3319+
VPValue VF;
3320+
32973321
/// Holds a mapping between Values and their corresponding VPValue inside
32983322
/// VPlan.
32993323
Value2VPValueTy Value2VPValue;
@@ -3388,6 +3412,7 @@ class VPlan {
33883412

33893413
/// Returns VF * UF of the vector loop region.
33903414
VPValue &getVFxUF() { return VFxUF; }
3415+
VPValue &getVF() { return VF; }
33913416

33923417
void addVF(ElementCount VF) { VFs.insert(VF); }
33933418

@@ -3825,6 +3850,29 @@ inline bool isUniformAfterVectorization(const VPValue *VPV) {
38253850

38263851
/// Return true if \p V is a header mask in \p Plan.
38273852
bool isHeaderMask(const VPValue *V, VPlan &Plan);
3853+
3854+
/// Checks if \p V is uniform across all VFs and UFs. It is considered as such
3855+
/// if it is either defined outside the vector region or its operand is known to
3856+
/// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
3857+
inline bool isUniformAcrossVFsAndUFs(VPValue *V) {
3858+
if (auto *VPI = dyn_cast_or_null<VPInstruction>(V->getDefiningRecipe())) {
3859+
return VPI ==
3860+
VPI->getParent()->getPlan()->getCanonicalIV()->getBackedgeValue();
3861+
}
3862+
if (isa<VPCanonicalIVPHIRecipe, VPDerivedIVRecipe, VPExpandSCEVRecipe>(V))
3863+
return true;
3864+
if (isa<VPReplicateRecipe>(V) && cast<VPReplicateRecipe>(V)->isUniform() &&
3865+
(isa<LoadInst, StoreInst>(V->getUnderlyingValue())) &&
3866+
all_of(V->getDefiningRecipe()->operands(),
3867+
[](VPValue *Op) { return Op->isDefinedOutsideVectorRegions(); }))
3868+
return true;
3869+
3870+
auto *C = dyn_cast_or_null<VPScalarCastRecipe>(V->getDefiningRecipe());
3871+
return C && (C->isDefinedOutsideVectorRegions() ||
3872+
isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
3873+
isa<VPCanonicalIVPHIRecipe>(C->getOperand(0)));
3874+
}
3875+
38283876
} // end namespace vputils
38293877

38303878
} // end namespace llvm

0 commit comments

Comments
 (0)