@@ -1092,6 +1092,7 @@ class LoopVectorizationCostModel {
     CM_Widen_Reverse, // For consecutive accesses with stride -1.
     CM_Interleave,
     CM_GatherScatter,
+    CM_Strided,
     CM_Scalarize,
     CM_VectorCall,
     CM_IntrinsicCall
@@ -1325,6 +1326,20 @@ class LoopVectorizationCostModel {
     return InterleaveInfo.getInterleaveGroup(Instr);
   }
 
+  /// Returns true if \p I is a memory instruction with strided memory access
+  /// that can be vectorized.
+  bool stridedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
+
+  /// Get the stride of the strided memory access instruction \p Instr. Return 0
+  /// if the instruction \p Instr is not considered for vectorization as a
+  /// strided memory access.
+  int64_t getStride(Instruction *Instr) const {
+    auto It = StrideInfo.find(Instr);
+    if (It != StrideInfo.end())
+      return It->second;
+    return 0;
+  }
+
   /// Returns true if we're required to use a scalar epilogue for at least
   /// the final iteration of the original loop.
   bool requiresScalarEpilogue(bool IsVectorizing) const {
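
Note (not part of the patch): a minimal standalone sketch of the getStride() contract declared above, with 0 as the "not a strided access" sentinel. FakeInstr and std::unordered_map are illustrative stand-ins for llvm::Instruction and DenseMap.

#include <cstdint>
#include <iostream>
#include <unordered_map>

struct FakeInstr {}; // stand-in for llvm::Instruction

// Same contract as getStride() above: the recorded stride, or 0 when the
// instruction was never recorded as a strided access.
int64_t getStride(
    const std::unordered_map<const FakeInstr *, int64_t> &StrideInfo,
    const FakeInstr *I) {
  auto It = StrideInfo.find(I);
  return It != StrideInfo.end() ? It->second : 0;
}

int main() {
  FakeInstr ReverseLoad, PlainLoad;
  std::unordered_map<const FakeInstr *, int64_t> StrideInfo = {{&ReverseLoad, -1}};
  std::cout << getStride(StrideInfo, &ReverseLoad) << "\n"; // -1
  std::cout << getStride(StrideInfo, &PlainLoad) << "\n";   // 0
}
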
@@ -1579,6 +1594,10 @@ class LoopVectorizationCostModel {
   /// element)
   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
 
+  /// The cost computation for strided load/store instruction.
+  InstructionCost getStridedLoadStoreCost(Instruction *I,
+                                          ElementCount VF) const;
+
   /// Estimate the overhead of scalarizing an instruction. This is a
   /// convenience wrapper for the type-based getScalarizationOverhead API.
   InstructionCost getScalarizationOverhead(Instruction *I,
@@ -1718,6 +1737,9 @@ class LoopVectorizationCostModel {
         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
   }
 
+  /// The mapping of memory access instructions to their stride values.
+  DenseMap<Instruction *, int64_t> StrideInfo;
+
 public:
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -3275,6 +3297,31 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
   return true;
 }
 
+bool LoopVectorizationCostModel::stridedAccessCanBeWidened(
+    Instruction *I, ElementCount VF) const {
+  // Get and ensure we have a valid memory instruction.
+  assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
+
+  // Only support strided access for vector VF.
+  if (!VF.isVector())
+    return false;
+
+  // FIXME: Remove this check for StoreInst after strided store is supported.
+  if (isa<StoreInst>(I))
+    return false;
+
+  [[maybe_unused]] auto *Ptr = getLoadStorePointerOperand(I);
+  auto *ScalarTy = getLoadStoreType(I);
+  // TODO: Support non-unit-reverse strided accesses. Add stride analysis here
+  // to ensure that the accessed addresses are evenly spaced apart by a fixed
+  // stride.
+  assert(Legal->isConsecutivePtr(ScalarTy, Ptr) == -1 &&
+         "Only supports strided accesses with a stride of -1");
+
+  const Align Alignment = getLoadStoreAlignment(I);
+  return TTI.isLegalStridedLoadStore(toVectorTy(ScalarTy, VF), Alignment);
+}
+
 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
   // We should not collect Uniforms more than once per VF. Right now,
   // this function is called from collectUniformsAndScalars(), which
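
Note (not part of the patch): a hypothetical scalar loop of the shape this legality check targets. The read of src is consecutive with an element stride of -1, which is exactly what the isConsecutivePtr assertion above expects; the only remaining question the function asks is whether the target reports a legal strided load for the widened type and alignment.

#include <cstddef>

// The read of src walks memory backwards, one element at a time (stride -1).
void reverse_copy(int *dst, const int *src, size_t n) {
  for (size_t i = 0; i < n; ++i)
    dst[i] = src[n - 1 - i];
}
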
@@ -3365,9 +3412,9 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
     if (IsUniformMemOpUse(I))
       return true;
 
-    return (WideningDecision == CM_Widen ||
-            WideningDecision == CM_Widen_Reverse ||
-            WideningDecision == CM_Interleave);
+    return (
+        WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse ||
+        WideningDecision == CM_Strided || WideningDecision == CM_Interleave);
   };
 
   // Returns true if Ptr is the pointer operand of a memory access instruction
@@ -4205,7 +4252,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
               [](const auto *R) { return Instruction::Select; })
           .Case<VPWidenStoreRecipe>(
               [](const auto *R) { return Instruction::Store; })
-          .Case<VPWidenLoadRecipe>(
+          .Case<VPWidenLoadRecipe, VPWidenStridedLoadRecipe>(
               [](const auto *R) { return Instruction::Load; })
           .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
               [](const auto *R) { return Instruction::Call; })
@@ -4304,6 +4351,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
     case VPDef::VPWidenPointerInductionSC:
     case VPDef::VPReductionPHISC:
     case VPDef::VPInterleaveSC:
+    case VPDef::VPWidenStridedLoadSC:
     case VPDef::VPWidenLoadEVLSC:
     case VPDef::VPWidenLoadSC:
     case VPDef::VPWidenStoreEVLSC:
@@ -5884,6 +5932,19 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
   return Cost;
 }
 
+InstructionCost
+LoopVectorizationCostModel::getStridedLoadStoreCost(Instruction *I,
+                                                    ElementCount VF) const {
+  Type *ValTy = getLoadStoreType(I);
+  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
+  const Align Alignment = getLoadStoreAlignment(I);
+  const Value *Ptr = getLoadStorePointerOperand(I);
+
+  return TTI.getStridedMemoryOpCost(I->getOpcode(), VectorTy, Ptr,
+                                    Legal->isMaskRequired(I), Alignment,
+                                    CostKind, I);
+}
+
 std::optional<InstructionCost>
 LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
                                                     ElementCount VF,
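
Note (not part of the patch): a scalar emulation of the operation whose cost is queried above. A strided load reads VF elements at Base + Lane * StrideBytes; with a stride of -1 elements it walks backwards from the pointer. Names and the memcpy-based emulation are illustrative only.

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Gather VF lanes, each loaded from Base + Lane * StrideBytes.
std::vector<int32_t> stridedLoad(const int32_t *Base, int64_t StrideBytes,
                                 unsigned VF) {
  std::vector<int32_t> Result(VF);
  const char *Raw = reinterpret_cast<const char *>(Base);
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    std::memcpy(&Result[Lane], Raw + Lane * StrideBytes, sizeof(int32_t));
  return Result;
}

int main() {
  int32_t Data[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  // An element stride of -1 on i32 is a byte stride of -4; start at Data[7].
  for (int32_t V : stridedLoad(&Data[7], -4, 4))
    std::cout << V << ' '; // prints: 7 6 5 4
  std::cout << '\n';
}
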
@@ -6203,6 +6264,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
                "Expected consecutive stride.");
         InstWidening Decision =
             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
+        // Consider using strided load/store for consecutive reverse accesses
+        // to achieve more efficient memory operations.
+        if (ConsecutiveStride == -1 && stridedAccessCanBeWidened(&I, VF)) {
+          const InstructionCost StridedLoadStoreCost =
+              getStridedLoadStoreCost(&I, VF);
+          if (StridedLoadStoreCost < Cost) {
+            Decision = CM_Strided;
+            Cost = StridedLoadStoreCost;
+            StrideInfo[&I] = ConsecutiveStride;
+          }
+        }
         setWideningDecision(&I, VF, Decision, Cost);
         continue;
       }
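
Note (not part of the patch): a standalone sketch of the decision flip above, with illustrative names and costs. The reverse-widen plan is kept unless the target reports a strictly cheaper strided load, in which case the stride is recorded and the decision switches to CM_Strided.

#include <cstdint>
#include <iostream>

enum class Widening { WidenReverse, Strided };

struct Decision {
  Widening Kind;
  uint64_t Cost;
  int64_t Stride; // only meaningful when Kind == Strided
};

// Prefer the strided form only when it is strictly cheaper.
Decision pickReverseVsStrided(uint64_t ReverseCost, uint64_t StridedCost) {
  if (StridedCost < ReverseCost)
    return {Widening::Strided, StridedCost, /*Stride=*/-1};
  return {Widening::WidenReverse, ReverseCost, /*Stride=*/0};
}

int main() {
  Decision D = pickReverseVsStrided(/*ReverseCost=*/6, /*StridedCost=*/4);
  std::cout << (D.Kind == Widening::Strided ? "strided" : "reverse")
            << " at cost " << D.Cost << '\n'; // strided at cost 4
}
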
@@ -6854,6 +6926,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
         return TTI::CastContextHint::Normal;
 
       switch (getWideningDecision(I, VF)) {
+      // TODO: New CastContextHint for strided accesses.
+      case LoopVectorizationCostModel::CM_Strided:
       case LoopVectorizationCostModel::CM_GatherScatter:
         return TTI::CastContextHint::GatherScatter;
       case LoopVectorizationCostModel::CM_Interleave:
@@ -8425,16 +8499,27 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
   // reverse consecutive.
   LoopVectorizationCostModel::InstWidening Decision =
       CM.getWideningDecision(I, Range.Start);
+
+  auto SameWiden = [&](ElementCount VF) -> bool {
+    return Decision == CM.getWideningDecision(I, VF);
+  };
+  bool ContainsWidenVF =
+      LoopVectorizationPlanner::getDecisionAndClampRange(SameWiden, Range);
+  assert(ContainsWidenVF &&
+         "At least widen the memory accesses by the Start VF.");
+
   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
   bool Consecutive =
       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+  bool Strided = Decision == LoopVectorizationCostModel::CM_Strided;
 
   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
-  if (Consecutive) {
+  if (Consecutive || Strided) {
     auto *GEP = dyn_cast<GetElementPtrInst>(
         Ptr->getUnderlyingValue()->stripPointerCasts());
     VPSingleDefRecipe *VectorPtr;
     if (Reverse) {
+      assert(!Strided && "Reverse and Strided are mutually exclusive.");
       // When folding the tail, we may compute an address that we don't in the
       // original scalar loop and it may not be inbounds. Drop Inbounds in that
       // case.
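
Note (not part of the patch): a simplified standalone model of the range clamping performed by getDecisionAndClampRange above, not the VPlan API. It only illustrates the case used here, where the predicate (same widening decision as at Range.Start) trivially holds for the first VF, so the recipe built below is valid for every VF left in the clamped range.

#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

// Keep the leading VFs for which Pred holds, drop the rest, and report whether
// the first VF qualified.
bool clampByPredicate(std::vector<unsigned> &VFs,
                      const std::function<bool(unsigned)> &Pred) {
  std::size_t Keep = 0;
  while (Keep < VFs.size() && Pred(VFs[Keep]))
    ++Keep;
  VFs.resize(Keep);
  return Keep > 0;
}

int main() {
  std::vector<unsigned> VFs = {2, 4, 8, 16};
  // Pretend the decision recorded for VF=16 differs from the one at VF=2.
  bool SameAsStart = clampByPredicate(VFs, [](unsigned VF) { return VF <= 8; });
  std::cout << SameAsStart << ", kept " << VFs.size() << " VFs\n"; // 1, kept 3
}
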
@@ -8445,17 +8530,30 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
       VectorPtr = new VPVectorEndPointerRecipe(
           Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
     } else {
-      VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
+      VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), Strided,
                                             GEP ? GEP->getNoWrapFlags()
                                                 : GEPNoWrapFlags::none(),
                                             I->getDebugLoc());
     }
     Builder.insert(VectorPtr);
     Ptr = VectorPtr;
   }
-  if (LoadInst *Load = dyn_cast<LoadInst>(I))
+  if (LoadInst *Load = dyn_cast<LoadInst>(I)) {
+    if (Strided) {
+      const DataLayout &DL = Load->getDataLayout();
+      auto *StrideTy = DL.getIndexType(Load->getPointerOperand()->getType());
+      int64_t Stride = CM.getStride(Load);
+      assert(Stride == -1 &&
+             "Only strided memory access with a stride of -1 is supported.");
+      VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get(
+          StrideTy, Stride * DL.getTypeAllocSize(getLoadStoreType(Load))));
+      return new VPWidenStridedLoadRecipe(*Load, Ptr, StrideVPV, &Plan.getVF(),
+                                          Mask, VPIRMetadata(*Load, LVer),
+                                          I->getDebugLoc());
+    }
     return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
                                  VPIRMetadata(*Load, LVer), I->getDebugLoc());
+  }
 
   StoreInst *Store = cast<StoreInst>(I);
   return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
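
Note (not part of the patch): the stride operand handed to VPWidenStridedLoadRecipe is in bytes, i.e. the element stride scaled by the element's allocation size (Stride * DL.getTypeAllocSize(...) above). A trivial standalone illustration of that scaling, with sizeof standing in for DataLayout on a typical target:

#include <cstdint>
#include <iostream>

int main() {
  const int64_t ElementStride = -1; // the only stride produced so far
  std::cout << ElementStride * static_cast<int64_t>(sizeof(int32_t)) << '\n'; // -4
  std::cout << ElementStride * static_cast<int64_t>(sizeof(double)) << '\n';  // -8
}
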