Commit 67b84db

Init: New Recipe VPWidenStridedLoadRecipe

Parent: f687ed9
10 files changed: +453 -256 lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 105 additions & 7 deletions
@@ -1092,6 +1092,7 @@ class LoopVectorizationCostModel {
     CM_Widen_Reverse, // For consecutive accesses with stride -1.
     CM_Interleave,
     CM_GatherScatter,
+    CM_Strided,
     CM_Scalarize,
     CM_VectorCall,
     CM_IntrinsicCall
@@ -1325,6 +1326,20 @@ class LoopVectorizationCostModel {
     return InterleaveInfo.getInterleaveGroup(Instr);
   }
 
+  /// Returns true if \p I is a memory instruction with strided memory access
+  /// that can be vectorized.
+  bool stridedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
+
+  /// Get the stride of the strided memory access instruction \p Instr. Return 0
+  /// if the instruction \p Instr is not considered for vectorization as a
+  /// strided memory access.
+  int64_t getStride(Instruction *Instr) const {
+    auto It = StrideInfo.find(Instr);
+    if (It != StrideInfo.end())
+      return It->second;
+    return 0;
+  }
+
   /// Returns true if we're required to use a scalar epilogue for at least
   /// the final iteration of the original loop.
   bool requiresScalarEpilogue(bool IsVectorizing) const {
@@ -1579,6 +1594,10 @@ class LoopVectorizationCostModel {
   /// element)
   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
 
+  /// The cost computation for strided load/store instruction.
+  InstructionCost getStridedLoadStoreCost(Instruction *I,
+                                          ElementCount VF) const;
+
   /// Estimate the overhead of scalarizing an instruction. This is a
   /// convenience wrapper for the type-based getScalarizationOverhead API.
   InstructionCost getScalarizationOverhead(Instruction *I,
@@ -1718,6 +1737,9 @@ class LoopVectorizationCostModel {
         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
   }
 
+  /// The mapping of memory access instructions to their stride values.
+  DenseMap<Instruction *, int64_t> StrideInfo;
+
 public:
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -3275,6 +3297,31 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
   return true;
 }
 
+bool LoopVectorizationCostModel::stridedAccessCanBeWidened(
+    Instruction *I, ElementCount VF) const {
+  // Get and ensure we have a valid memory instruction.
+  assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
+
+  // Only support strided access for vector VF.
+  if (!VF.isVector())
+    return false;
+
+  // FIXME: Remove this check for StoreInst after strided store is supported.
+  if (isa<StoreInst>(I))
+    return false;
+
+  [[maybe_unused]] auto *Ptr = getLoadStorePointerOperand(I);
+  auto *ScalarTy = getLoadStoreType(I);
+  // TODO: Support non-unit-reverse strided accesses. Add stride analysis here
+  // to ensure that the accessed addresses are evenly spaced apart by a fixed
+  // stride.
+  assert(Legal->isConsecutivePtr(ScalarTy, Ptr) == -1 &&
+         "Only supports strided accesses with a stride of -1");
+
+  const Align Alignment = getLoadStoreAlignment(I);
+  return TTI.isLegalStridedLoadStore(toVectorTy(ScalarTy, VF), Alignment);
+}
+
 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
   // We should not collect Uniforms more than once per VF. Right now,
   // this function is called from collectUniformsAndScalars(), which
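
For context, the pattern stridedAccessCanBeWidened() targets is an ordinary loop that walks an array backwards with unit step, which Legal->isConsecutivePtr() reports as stride -1. A minimal source-level illustration (not part of this commit):

// Illustration only: a[i] is a consecutive access with stride -1, the case the
// cost model may now widen as a strided load instead of a reversed wide load.
float reverse_sum(const float *a, int n) {
  float sum = 0.0f;
  for (int i = n - 1; i >= 0; --i)
    sum += a[i];
  return sum;
}
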
@@ -3365,9 +3412,9 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
     if (IsUniformMemOpUse(I))
       return true;
 
-    return (WideningDecision == CM_Widen ||
-            WideningDecision == CM_Widen_Reverse ||
-            WideningDecision == CM_Interleave);
+    return (
+        WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse ||
+        WideningDecision == CM_Strided || WideningDecision == CM_Interleave);
   };
 
   // Returns true if Ptr is the pointer operand of a memory access instruction
@@ -4205,7 +4252,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
               [](const auto *R) { return Instruction::Select; })
           .Case<VPWidenStoreRecipe>(
               [](const auto *R) { return Instruction::Store; })
-          .Case<VPWidenLoadRecipe>(
+          .Case<VPWidenLoadRecipe, VPWidenStridedLoadRecipe>(
               [](const auto *R) { return Instruction::Load; })
          .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
              [](const auto *R) { return Instruction::Call; })
@@ -4304,6 +4351,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
      case VPDef::VPWidenPointerInductionSC:
      case VPDef::VPReductionPHISC:
      case VPDef::VPInterleaveSC:
+      case VPDef::VPWidenStridedLoadSC:
      case VPDef::VPWidenLoadEVLSC:
      case VPDef::VPWidenLoadSC:
      case VPDef::VPWidenStoreEVLSC:
@@ -5883,6 +5931,19 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
   return Cost;
 }
 
+InstructionCost
+LoopVectorizationCostModel::getStridedLoadStoreCost(Instruction *I,
+                                                    ElementCount VF) const {
+  Type *ValTy = getLoadStoreType(I);
+  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
+  const Align Alignment = getLoadStoreAlignment(I);
+  const Value *Ptr = getLoadStorePointerOperand(I);
+
+  return TTI.getStridedMemoryOpCost(I->getOpcode(), VectorTy, Ptr,
+                                    Legal->isMaskRequired(I), Alignment,
+                                    CostKind, I);
+}
+
 std::optional<InstructionCost>
 LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
                                                     ElementCount VF,
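
The TTI hooks consulted above (isLegalStridedLoadStore() in stridedAccessCanBeWidened(), getStridedMemoryOpCost() here) can be exercised in isolation. A minimal sketch, not part of this commit; the <vscale x 4 x i32> type, the alignment, and the helper name are assumed purely for illustration:

// Sketch only: mirrors the legality and cost queries the cost model issues to
// TTI for a strided load of a scalable i32 vector.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

static InstructionCost queryStridedLoadCost(const TargetTransformInfo &TTI,
                                            LLVMContext &Ctx) {
  auto *VecTy = ScalableVectorType::get(Type::getInt32Ty(Ctx), 4);
  const Align Alignment(4);
  // Bail out with an invalid cost if the target has no strided load/store.
  if (!TTI.isLegalStridedLoadStore(VecTy, Alignment))
    return InstructionCost::getInvalid();
  return TTI.getStridedMemoryOpCost(Instruction::Load, VecTy, /*Ptr=*/nullptr,
                                    /*VariableMask=*/false, Alignment,
                                    TargetTransformInfo::TCK_RecipThroughput);
}
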
@@ -6202,6 +6263,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
                "Expected consecutive stride.");
         InstWidening Decision =
             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
+        // Consider using strided load/store for consecutive reverse accesses to
+        // achieve more efficient memory operations.
+        if (ConsecutiveStride == -1 && stridedAccessCanBeWidened(&I, VF)) {
+          const InstructionCost StridedLoadStoreCost =
+              getStridedLoadStoreCost(&I, VF);
+          if (StridedLoadStoreCost < Cost) {
+            Decision = CM_Strided;
+            Cost = StridedLoadStoreCost;
+            StrideInfo[&I] = ConsecutiveStride;
+          }
+        }
         setWideningDecision(&I, VF, Decision, Cost);
         continue;
       }
@@ -6853,6 +6925,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
        return TTI::CastContextHint::Normal;
 
      switch (getWideningDecision(I, VF)) {
+      // TODO: New CastContextHint for strided accesses.
+      case LoopVectorizationCostModel::CM_Strided:
      case LoopVectorizationCostModel::CM_GatherScatter:
        return TTI::CastContextHint::GatherScatter;
      case LoopVectorizationCostModel::CM_Interleave:
@@ -8424,16 +8498,27 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
   // reverse consecutive.
   LoopVectorizationCostModel::InstWidening Decision =
       CM.getWideningDecision(I, Range.Start);
+
+  auto SameWiden = [&](ElementCount VF) -> bool {
+    return Decision == CM.getWideningDecision(I, VF);
+  };
+  bool ContainsWidenVF =
+      LoopVectorizationPlanner::getDecisionAndClampRange(SameWiden, Range);
+  assert(ContainsWidenVF &&
+         "At least widen the memory accesses by the Start VF.");
+
   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
   bool Consecutive =
       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+  bool Strided = Decision == LoopVectorizationCostModel::CM_Strided;
 
   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
-  if (Consecutive) {
+  if (Consecutive || Strided) {
     auto *GEP = dyn_cast<GetElementPtrInst>(
         Ptr->getUnderlyingValue()->stripPointerCasts());
     VPSingleDefRecipe *VectorPtr;
     if (Reverse) {
+      assert(!Strided && "Reverse and Strided are mutually exclusive.");
      // When folding the tail, we may compute an address that we don't in the
      // original scalar loop and it may not be inbounds. Drop Inbounds in that
      // case.
@@ -8444,17 +8529,30 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
       VectorPtr = new VPVectorEndPointerRecipe(
           Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
     } else {
-      VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
+      VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), Strided,
                                             GEP ? GEP->getNoWrapFlags()
                                                 : GEPNoWrapFlags::none(),
                                             I->getDebugLoc());
     }
     Builder.insert(VectorPtr);
     Ptr = VectorPtr;
   }
-  if (LoadInst *Load = dyn_cast<LoadInst>(I))
+  if (LoadInst *Load = dyn_cast<LoadInst>(I)) {
+    if (Strided) {
+      const DataLayout &DL = Load->getDataLayout();
+      auto *StrideTy = DL.getIndexType(Load->getPointerOperand()->getType());
+      int64_t Stride = CM.getStride(Load);
+      assert(Stride == -1 &&
+             "Only stride memory access with a stride of -1 is supported.");
+      VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get(
+          StrideTy, Stride * DL.getTypeAllocSize(getLoadStoreType(Load))));
+      return new VPWidenStridedLoadRecipe(*Load, Ptr, StrideVPV, &Plan.getVF(),
+                                          Mask, VPIRMetadata(*Load, LVer),
+                                          I->getDebugLoc());
+    }
     return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
                                  VPIRMetadata(*Load, LVer), I->getDebugLoc());
+  }
 
   StoreInst *Store = cast<StoreInst>(I);
   return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
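
Note that the stride cached in StrideInfo is measured in elements (-1), while the recipe's stride operand is in bytes, hence the scaling by DL.getTypeAllocSize() above. A hypothetical standalone helper showing the same computation (illustration only, not from this commit):

// Sketch only: the byte stride handed to VPWidenStridedLoadRecipe for a load
// with a known element stride. For a reverse i32 access (element stride -1)
// this yields -4.
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"

static int64_t byteStrideFor(const llvm::LoadInst &Load, int64_t ElemStride) {
  const llvm::DataLayout &DL = Load.getDataLayout();
  return ElemStride * static_cast<int64_t>(
                          DL.getTypeAllocSize(Load.getType()).getFixedValue());
}
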

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 62 additions & 5 deletions
@@ -542,6 +542,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPBranchOnMaskSC:
     case VPRecipeBase::VPInterleaveSC:
     case VPRecipeBase::VPIRInstructionSC:
+    case VPRecipeBase::VPWidenStridedLoadSC:
     case VPRecipeBase::VPWidenLoadEVLSC:
     case VPRecipeBase::VPWidenLoadSC:
     case VPRecipeBase::VPWidenStoreEVLSC:
@@ -1714,16 +1715,21 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
 };
 
 /// A recipe to compute the pointers for widened memory accesses of IndexTy.
+/// Supports both consecutive and reverse consecutive accesses.
+/// TODO: Support non-unit strided accesses.
 class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
                               public VPUnrollPartAccessor<1> {
   Type *IndexedTy;
 
+  /// Indicate whether to compute the pointer for strided memory accesses.
+  bool Strided;
+
 public:
-  VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, GEPNoWrapFlags GEPFlags,
-                        DebugLoc DL)
+  VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool Strided,
+                        GEPNoWrapFlags GEPFlags, DebugLoc DL)
       : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
                             GEPFlags, DL),
-        IndexedTy(IndexedTy) {}
+        IndexedTy(IndexedTy), Strided(Strided) {}
 
   VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
 
@@ -1744,7 +1750,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
   }
 
   VPVectorPointerRecipe *clone() override {
-    return new VPVectorPointerRecipe(getOperand(0), IndexedTy,
+    return new VPVectorPointerRecipe(getOperand(0), IndexedTy, Strided,
                                      getGEPNoWrapFlags(), getDebugLoc());
   }
 
@@ -2740,7 +2746,8 @@ class VPWidenMemoryRecipe : public VPRecipeBase, public VPIRMetadata {
     return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
            R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
            R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC ||
-           R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC;
+           R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC ||
+           R->getVPDefID() == VPRecipeBase::VPWidenStridedLoadSC;
   }
 
   static inline bool classof(const VPUser *U) {
@@ -2859,6 +2866,56 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
   }
 };
 
+/// A recipe for strided load operations, using the base address, stride, and
+/// an optional mask. This recipe will generate a vp.strided.load intrinsic
+/// call to represent memory accesses with a fixed stride.
+struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe,
+                                        public VPValue {
+  VPWidenStridedLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Stride,
+                           VPValue *VF, VPValue *Mask,
+                           const VPIRMetadata &Metadata, DebugLoc DL)
+      : VPWidenMemoryRecipe(
+            VPDef::VPWidenStridedLoadSC, Load, {Addr, Stride, VF},
+            /*Consecutive=*/false, /*Reverse=*/false, Metadata, DL),
+        VPValue(this, &Load) {
+    setMask(Mask);
+  }
+
+  VPWidenStridedLoadRecipe *clone() override {
+    return new VPWidenStridedLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
+                                        getStride(), getVF(), getMask(), *this,
+                                        getDebugLoc());
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPWidenStridedLoadSC);
+
+  /// Return the stride operand.
+  VPValue *getStride() const { return getOperand(1); }
+
+  /// Return the VF operand.
+  VPValue *getVF() const { return getOperand(2); }
+
+  /// Generate a strided load.
+  void execute(VPTransformState &State) override;
+
+  /// Return the cost of this VPWidenStridedLoadRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    return Op == getAddr() || Op == getStride() || Op == getVF();
+  }
+};
+
 /// A recipe for widening store operations, using the stored value, the address
 /// to store to and an optional mask.
 struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
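
The recipe above only declares execute(), computeCost(), and print(); their definitions land in VPlanRecipes.cpp, one of the ten changed files not shown in this excerpt. As a rough sketch of the lowering the class comment promises (a vp.strided.load call), execute() could look roughly like the following; the State accessors, the applyMetadata() helper, and the omission of alignment handling are assumptions, not code from the commit:

// Sketch only: plausible lowering of VPWidenStridedLoadRecipe to the
// llvm.experimental.vp.strided.load intrinsic; see VPlanRecipes.cpp in the
// commit for the real implementation.
void VPWidenStridedLoadRecipe::execute(VPTransformState &State) {
  Type *ScalarTy = getLoadStoreType(&Ingredient);
  auto *DataTy = VectorType::get(ScalarTy, State.VF);
  IRBuilderBase &Builder = State.Builder;

  Value *Addr = State.get(getAddr(), /*IsScalar=*/true);
  Value *Stride = State.get(getStride(), /*IsScalar=*/true);
  Value *Mask = getMask()
                    ? State.get(getMask())
                    : Builder.CreateVectorSplat(State.VF, Builder.getTrue());
  // The VF operand becomes the explicit vector length (i32) of the VP call.
  Value *EVL = Builder.CreateZExtOrTrunc(State.get(getVF(), VPLane(0)),
                                         Builder.getInt32Ty());

  CallInst *NewLI = Builder.CreateIntrinsic(
      Intrinsic::experimental_vp_strided_load,
      {DataTy, Addr->getType(), Stride->getType()}, {Addr, Stride, Mask, EVL});
  // Alignment attribute and other bookkeeping omitted in this sketch.
  applyMetadata(*NewLI);
  State.set(this, NewLI);
}
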

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 4 additions & 2 deletions
@@ -160,8 +160,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
 }
 
 Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
-  assert((isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(R)) &&
-         "Store recipes should not define any values");
+  assert(
+      (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe>(
+          R)) &&
+      "Store recipes should not define any values");
   return cast<LoadInst>(&R->getIngredient())->getType();
 }
 