
Commit a0b61ad

[WIP][VPlan based] Clamp VF range in VPlan transformation
1 parent: d967afe

6 files changed, 146 insertions(+), 180 deletions(-)


llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 7 additions & 72 deletions
@@ -1329,15 +1329,6 @@ class LoopVectorizationCostModel {
     return InterleaveInfo.getInterleaveGroup(Instr);
   }
 
-  /// Returns true if \p I is a memory instruction with strided memory access
-  /// that can be vectorized.
-  bool stridedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
-
-  /// Get the stride information of the strided memory accesses.
-  SmallDenseMap<Instruction *, int64_t> getStrideInfo() const {
-    return StrideInfo;
-  }
-
   /// Returns true if we're required to use a scalar epilogue for at least
   /// the final iteration of the original loop.
   bool requiresScalarEpilogue(bool IsVectorizing) const {
@@ -1592,10 +1583,6 @@ class LoopVectorizationCostModel {
   /// element)
   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
 
-  /// The cost computation for strided load/store instruction.
-  InstructionCost getStridedLoadStoreCost(Instruction *I,
-                                          ElementCount VF) const;
-
   /// Estimate the overhead of scalarizing an instruction. This is a
   /// convenience wrapper for the type-based getScalarizationOverhead API.
   InstructionCost getScalarizationOverhead(Instruction *I,
@@ -1735,9 +1722,6 @@ class LoopVectorizationCostModel {
         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
   }
 
-  /// The mapping of memory access instructions to their stride values.
-  SmallDenseMap<Instruction *, int64_t> StrideInfo;
-
 public:
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -3295,31 +3279,6 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
   return true;
 }
 
-bool LoopVectorizationCostModel::stridedAccessCanBeWidened(
-    Instruction *I, ElementCount VF) const {
-  // Get and ensure we have a valid memory instruction.
-  assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
-
-  // Only support strided access for vector VF.
-  if (!VF.isVector())
-    return false;
-
-  // FIXME: Remove this check for StoreInst after strided store is supported.
-  if (isa<StoreInst>(I))
-    return false;
-
-  [[maybe_unused]] auto *Ptr = getLoadStorePointerOperand(I);
-  auto *ScalarTy = getLoadStoreType(I);
-  // TODO: Support non-unit-reverse strided accesses. Add stride analysis here
-  // to ensure that the accessed addresses are evenly spaced apart by a fixed
-  // stride.
-  assert(Legal->isConsecutivePtr(ScalarTy, Ptr) == -1 &&
-         "Only supports strided accesses with a stride of -1");
-
-  const Align Alignment = getLoadStoreAlignment(I);
-  return TTI.isLegalStridedLoadStore(toVectorTy(ScalarTy, VF), Alignment);
-}
-
 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
   // We should not collect Uniforms more than once per VF. Right now,
   // this function is called from collectUniformsAndScalars(), which
@@ -5723,19 +5682,6 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
   return Cost;
 }
 
-InstructionCost
-LoopVectorizationCostModel::getStridedLoadStoreCost(Instruction *I,
-                                                    ElementCount VF) const {
-  Type *ValTy = getLoadStoreType(I);
-  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
-  const Align Alignment = getLoadStoreAlignment(I);
-  const Value *Ptr = getLoadStorePointerOperand(I);
-
-  return TTI.getStridedMemoryOpCost(I->getOpcode(), VectorTy, Ptr,
-                                    Legal->isMaskRequired(I), Alignment,
-                                    CostKind, I);
-}
-
 std::optional<InstructionCost>
 LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
                                                     ElementCount VF,
@@ -6055,17 +6001,6 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
              "Expected consecutive stride.");
       InstWidening Decision =
           ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
-      // Consider using strided load/store for consecutive reverse accesses to
-      // achieve more efficient memory operations.
-      if (ConsecutiveStride == -1 && stridedAccessCanBeWidened(&I, VF)) {
-        const InstructionCost StridedLoadStoreCost =
-            getStridedLoadStoreCost(&I, VF);
-        if (StridedLoadStoreCost < Cost) {
-          Decision = CM_Strided;
-          Cost = StridedLoadStoreCost;
-          StrideInfo[&I] = ConsecutiveStride;
-        }
-      }
       setWideningDecision(&I, VF, Decision, Cost);
       continue;
     }
@@ -9478,12 +9413,15 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
   // clamp the range for better cost estimation.
   // TODO: Enable following transform when the EVL-version of extended-reduction
   // and mulacc-reduction are implemented.
-  if (!CM.foldTailWithEVL()) {
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
-                          CM.CostKind);
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
+                        CM.CostKind);
+  if (!CM.foldTailWithEVL())
     VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
                              CostCtx, Range);
-  }
+
+  // !!! NEED COMMENT
+  VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan,
+                           CostCtx, Range);
 
   for (ElementCount VF : Range)
     Plan->addVF(VF);
@@ -9495,9 +9433,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
   VPlanTransforms::runPass(VPlanTransforms::createInterleaveGroups, *Plan,
                            InterleaveGroups, RecipeBuilder,
                            CM.isScalarEpilogueAllowed());
-  // !!! NEED COMMENT
-  VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan,
-                           CM.getStrideInfo());
 
   // Replace VPValues for known constant strides guaranteed by predicate scalar
   // evolution.
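
Note: the range clamping relied on above comes from LoopVectorizationPlanner::getDecisionAndClampRange, which evaluates a per-VF predicate at Range.Start and shrinks Range.End so that every VF remaining in the range yields the same decision. A minimal sketch of that contract, paraphrased from the upstream helper (exact signature and iteration details may differ):

    // Sketch: test Predicate at the start of the range, then clamp Range.End
    // at the first VF whose decision differs, so all VFs left in Range agree
    // with the returned decision and one VPlan can serve the whole range.
    bool getDecisionAndClampRange(
        const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
      assert(!Range.isEmpty() && "Trying to test an empty VF range.");
      bool PredicateAtRangeStart = Predicate(Range.Start);
      // VFRange iterates over the candidate VFs (powers of two) in the range.
      for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
        if (Predicate(TmpVF) != PredicateAtRangeStart) {
          Range.End = TmpVF;
          break;
        }
      return PredicateAtRangeStart;
    }

This is what lets convertToStridedAccesses (later in this diff) fold legality and cost checks into a single predicate, instead of threading a precomputed StrideInfo map from the legacy cost model into the planner.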

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 2 additions & 4 deletions
@@ -1718,6 +1718,8 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
 
   VP_CLASSOF_IMPL(VPDef::VPVectorEndPointerSC)
 
+  VPValue *getPtr() const { return getOperand(0); }
+
   VPValue *getVFValue() { return getOperand(1); }
   const VPValue *getVFValue() const { return getOperand(1); }
 
@@ -3161,10 +3163,6 @@ struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe,
   /// Generate a strided load.
   void execute(VPTransformState &State) override;
 
-  /// Return the cost of this VPWidenStridedLoadRecipe.
-  InstructionCost computeCost(ElementCount VF,
-                              VPCostContext &Ctx) const override;
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 11 additions & 16 deletions
@@ -2316,7 +2316,7 @@ void VPVectorEndPointerRecipe::execute(VPTransformState &State) {
       ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF);
   // LastLane = 1 - RunTimeVF
   Value *LastLane = Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
-  Value *Ptr = State.get(getOperand(0), VPLane(0));
+  Value *Ptr = State.get(getPtr(), VPLane(0));
   Value *ResultPtr =
       Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags());
   ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "",
@@ -2923,9 +2923,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
   unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
                     ->getAddressSpace();
-  unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
-                        ? Instruction::Load
-                        : Instruction::Store;
+  unsigned Opcode =
+      isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe>(
+          this)
+          ? Instruction::Load
+          : Instruction::Store;
 
   if (!Consecutive) {
     // TODO: Using the original IR may not be accurate.
@@ -2934,6 +2936,11 @@
     const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
     assert(!Reverse &&
            "Inconsecutive memory access should not have the order.");
+
+    if (isa<VPWidenStridedLoadRecipe>(this))
+      return Ctx.TTI.getStridedMemoryOpCost(
+          Opcode, Ty, Ptr, IsMasked, Alignment, Ctx.CostKind, &Ingredient);
+
     return Ctx.TTI.getAddressComputationCost(Ty) +
            Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment,
                                           Ctx.CostKind, &Ingredient);
@@ -3128,18 +3135,6 @@ void VPWidenStridedLoadRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
-InstructionCost
-VPWidenStridedLoadRecipe::computeCost(ElementCount VF,
-                                      VPCostContext &Ctx) const {
-  Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
-  const Align Alignment = getLoadStoreAlignment(&Ingredient);
-  const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
-
-  return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, Ptr,
-                                        IsMasked, Alignment, Ctx.CostKind,
-                                        &Ingredient);
-}
-
 void VPWidenStoreRecipe::execute(VPTransformState &State) {
   VPValue *StoredVPValue = getStoredValue();
   bool CreateScatter = !isConsecutive();
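
Design note: with the computeCost override removed from VPWidenStridedLoadRecipe (above), all widened memory recipes share VPWidenMemoryRecipe::computeCost, which dispatches on the recipe kind: strided loads are costed via TTI::getStridedMemoryOpCost, while the remaining non-consecutive accesses keep the gather/scatter cost path. The strided cost is therefore computed in exactly one place.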

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 56 additions & 36 deletions
@@ -2517,48 +2517,68 @@ void VPlanTransforms::createInterleaveGroups(
   }
 }
 
-void VPlanTransforms::convertToStridedAccesses(
-    VPlan &Plan, const SmallDenseMap<Instruction *, int64_t> &StrideInfo) {
-  // !!! FIXME: Should remove StrideInfo for next step.
-  if (Plan.hasScalarVFOnly() || StrideInfo.empty())
+void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
+                                               VFRange &Range) {
+  if (Plan.hasScalarVFOnly())
     return;
 
-  // !!! FIXME: Should clamp VF for legal and cost in next step
   SmallVector<VPRecipeBase *> ToErase;
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-      // !!! FIXME: Should use LoadR->isReverse() for next step
-      if (auto *LoadR = dyn_cast<VPWidenLoadRecipe>(&R);
-          LoadR && !LoadR->isConsecutive()) {
-        auto *LI = cast<LoadInst>(&LoadR->getIngredient());
-        auto It = StrideInfo.find(LI);
-        if (It == StrideInfo.end())
-          continue;
-        int64_t Stride = It->second;
-        assert(Stride == -1 &&
-               "Only stride memory access with a stride of -1 is supported.");
-        // !!! FIXME: Should get VPVectorEndPointerRecipe for reverse
-        VPValue *Ptr = LoadR->getAddr();
-        auto *GEP = dyn_cast<GetElementPtrInst>(
-            Ptr->getUnderlyingValue()->stripPointerCasts());
-        auto *NewPtr = new VPVectorPointerRecipe(
-            Ptr, getLoadStoreType(LI), /*Stride*/ true,
-            GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(),
-            LoadR->getDebugLoc());
-        NewPtr->insertBefore(LoadR);
-
-        const DataLayout &DL = LI->getDataLayout();
-        auto *StrideTy = DL.getIndexType(LI->getPointerOperand()->getType());
-        VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get(
-            StrideTy, Stride * DL.getTypeAllocSize(getLoadStoreType(LI))));
-        auto *StridedLoad = new VPWidenStridedLoadRecipe(
-            *LI, NewPtr, StrideVPV, &Plan.getVF(), LoadR->getMask(), *LoadR,
-            LoadR->getDebugLoc());
-        StridedLoad->insertBefore(LoadR);
-        LoadR->replaceAllUsesWith(StridedLoad);
-        ToErase.push_back(LoadR);
-      }
+      auto *MemR = dyn_cast<VPWidenMemoryRecipe>(&R);
+      // TODO: support strided store
+      // TODO: support strided accesses with stride not equal to -1
+      if (!MemR || !isa<VPWidenLoadRecipe>(MemR) || !MemR->isReverse())
+        continue;
+
+      Instruction &Ingredient = MemR->getIngredient();
+      Type *ElementTy = getLoadStoreType(&Ingredient);
+
+      auto IsProfitable = [&](ElementCount VF) -> bool {
+        Type *DataTy = toVectorTy(ElementTy, VF);
+        const Align Alignment = getLoadStoreAlignment(&Ingredient);
+        if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment))
+          return false;
+        const InstructionCost CurrentCost = MemR->computeCost(VF, Ctx);
+        const InstructionCost StridedLoadStoreCost =
+            Ctx.TTI.getStridedMemoryOpCost(
+                Ingredient.getOpcode(), DataTy,
+                getLoadStorePointerOperand(&Ingredient), MemR->isMasked(),
+                Alignment, Ctx.CostKind, &Ingredient);
+        return StridedLoadStoreCost < CurrentCost;
+      };
+
+      if (!LoopVectorizationPlanner::getDecisionAndClampRange(IsProfitable,
+                                                              Range))
+        continue;
+
+      // The stride of consecutive reverse access must be -1.
+      int64_t Stride = -1;
+      auto *VecEndPtr = cast<VPVectorEndPointerRecipe>(MemR->getAddr());
+      VPValue *Ptr = VecEndPtr->getPtr();
+      auto *GEP = dyn_cast<GetElementPtrInst>(
+          Ptr->getUnderlyingValue()->stripPointerCasts());
+      // Create a new vector pointer for strided access.
+      auto *NewPtr = new VPVectorPointerRecipe(
+          Ptr, ElementTy, /*Stride=*/true,
+          GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(),
+          VecEndPtr->getDebugLoc());
+      NewPtr->insertBefore(MemR);
+
+      auto *LoadR = cast<VPWidenLoadRecipe>(MemR);
+      auto *LI = cast<LoadInst>(&Ingredient);
+      const DataLayout &DL = LI->getDataLayout();
+      auto *StrideTy = DL.getIndexType(LI->getPointerOperand()->getType());
+      VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get(
+          StrideTy, Stride * DL.getTypeAllocSize(ElementTy)));
+      auto *StridedLoad = new VPWidenStridedLoadRecipe(
+          *LI, NewPtr, StrideVPV, &Plan.getVF(), LoadR->getMask(), *LoadR,
+          LoadR->getDebugLoc());
+      StridedLoad->insertBefore(LoadR);
+      LoadR->replaceAllUsesWith(StridedLoad);
+
+      ToErase.append({LoadR, VecEndPtr});
     }
   }
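
To see the clamping in action, consider a hypothetical example (VF values invented for illustration): if the plan's range is [2, 16) and IsProfitable holds at VF=2 and VF=4 but fails at VF=8, getDecisionAndClampRange returns true and clamps Range.End to 8. This plan then converts the reverse load into a strided load for VF in {2, 4}, while VF >= 8 lands in a later plan that keeps the widened reverse load.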

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 2 additions & 2 deletions
@@ -172,8 +172,8 @@ struct VPlanTransforms {
       VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed);
 
   // !!! NEED COMMENT
-  static void convertToStridedAccesses(
-      VPlan &Plan, const SmallDenseMap<Instruction *, int64_t> &StrideInfo);
+  static void convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
+                                       VFRange &Range);
 
   /// Remove dead recipes from \p Plan.
   static void removeDeadRecipes(VPlan &Plan);
