@@ -1092,6 +1092,7 @@ class LoopVectorizationCostModel {
     CM_Widen_Reverse, // For consecutive accesses with stride -1.
     CM_Interleave,
     CM_GatherScatter,
+    CM_Strided,
     CM_Scalarize,
     CM_VectorCall,
     CM_IntrinsicCall
@@ -1325,6 +1326,20 @@ class LoopVectorizationCostModel {
     return InterleaveInfo.getInterleaveGroup(Instr);
   }
 
+  /// Returns true if \p I is a memory instruction with strided memory access
+  /// that can be vectorized.
+  bool stridedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
+
+  /// Get the stride of the strided memory access instruction \p Instr. Return 0
+  /// if the instruction \p Instr is not considered for vectorization as a
+  /// strided memory access.
+  int64_t getStride(Instruction *Instr) const {
+    auto It = StrideInfo.find(Instr);
+    if (It != StrideInfo.end())
+      return It->second;
+    return 0;
+  }
+
   /// Returns true if we're required to use a scalar epilogue for at least
   /// the final iteration of the original loop.
   bool requiresScalarEpilogue(bool IsVectorizing) const {
@@ -1579,6 +1594,10 @@ class LoopVectorizationCostModel {
   /// element)
   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
 
+  /// The cost computation for strided load/store instruction.
+  InstructionCost getStridedLoadStoreCost(Instruction *I,
+                                          ElementCount VF) const;
+
   /// Estimate the overhead of scalarizing an instruction. This is a
   /// convenience wrapper for the type-based getScalarizationOverhead API.
   InstructionCost getScalarizationOverhead(Instruction *I,
@@ -1718,6 +1737,9 @@ class LoopVectorizationCostModel {
         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
   }
 
+  /// The mapping of memory access instructions to their stride values.
+  DenseMap<Instruction *, int64_t> StrideInfo;
+
 public:
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -3275,6 +3297,31 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
   return true;
 }
 
+bool LoopVectorizationCostModel::stridedAccessCanBeWidened(
+    Instruction *I, ElementCount VF) const {
+  // Get and ensure we have a valid memory instruction.
+  assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
+
+  // Only support strided access for vector VF.
+  if (!VF.isVector())
+    return false;
+
+  // FIXME: Remove this check for StoreInst after strided store is supported.
+  if (isa<StoreInst>(I))
+    return false;
+
+  [[maybe_unused]] auto *Ptr = getLoadStorePointerOperand(I);
+  auto *ScalarTy = getLoadStoreType(I);
+  // TODO: Support non-unit-reverse strided accesses. Add stride analysis here
+  // to ensure that the accessed addresses are evenly spaced apart by a fixed
+  // stride.
+  assert(Legal->isConsecutivePtr(ScalarTy, Ptr) == -1 &&
+         "Only supports strided accesses with a stride of -1");
+
+  const Align Alignment = getLoadStoreAlignment(I);
+  return TTI.isLegalStridedLoadStore(toVectorTy(ScalarTy, VF), Alignment);
+}
+
 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
   // We should not collect Uniforms more than once per VF. Right now,
   // this function is called from collectUniformsAndScalars(), which
@@ -3365,9 +3412,9 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
     if (IsUniformMemOpUse(I))
       return true;
 
-    return (WideningDecision == CM_Widen ||
-            WideningDecision == CM_Widen_Reverse ||
-            WideningDecision == CM_Interleave);
+    return (
+        WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse ||
+        WideningDecision == CM_Strided || WideningDecision == CM_Interleave);
   };
 
   // Returns true if Ptr is the pointer operand of a memory access instruction
@@ -4205,7 +4252,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
                  [](const auto *R) { return Instruction::Select; })
              .Case<VPWidenStoreRecipe>(
                  [](const auto *R) { return Instruction::Store; })
-             .Case<VPWidenLoadRecipe>(
+             .Case<VPWidenLoadRecipe, VPWidenStridedLoadRecipe>(
                  [](const auto *R) { return Instruction::Load; })
              .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
                  [](const auto *R) { return Instruction::Call; })
@@ -4304,6 +4351,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
       case VPDef::VPWidenPointerInductionSC:
       case VPDef::VPReductionPHISC:
       case VPDef::VPInterleaveSC:
+      case VPDef::VPWidenStridedLoadSC:
       case VPDef::VPWidenLoadEVLSC:
       case VPDef::VPWidenLoadSC:
       case VPDef::VPWidenStoreEVLSC:
@@ -5883,6 +5931,19 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
   return Cost;
 }
 
+InstructionCost
+LoopVectorizationCostModel::getStridedLoadStoreCost(Instruction *I,
+                                                    ElementCount VF) const {
+  Type *ValTy = getLoadStoreType(I);
+  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
+  const Align Alignment = getLoadStoreAlignment(I);
+  const Value *Ptr = getLoadStorePointerOperand(I);
+
+  return TTI.getStridedMemoryOpCost(I->getOpcode(), VectorTy, Ptr,
+                                    Legal->isMaskRequired(I), Alignment,
+                                    CostKind, I);
+}
+
 std::optional<InstructionCost>
 LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
                                                     ElementCount VF,
@@ -6202,6 +6263,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
                "Expected consecutive stride.");
         InstWidening Decision =
             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
+        // Consider using strided load/store for consecutive reverse accesses to
+        // achieve more efficient memory operations.
+        if (ConsecutiveStride == -1 && stridedAccessCanBeWidened(&I, VF)) {
+          const InstructionCost StridedLoadStoreCost =
+              getStridedLoadStoreCost(&I, VF);
+          if (StridedLoadStoreCost < Cost) {
+            Decision = CM_Strided;
+            Cost = StridedLoadStoreCost;
+            StrideInfo[&I] = ConsecutiveStride;
+          }
+        }
         setWideningDecision(&I, VF, Decision, Cost);
         continue;
       }
@@ -6853,6 +6925,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
       return TTI::CastContextHint::Normal;
 
     switch (getWideningDecision(I, VF)) {
+    // TODO: New CastContextHint for strided accesses.
+    case LoopVectorizationCostModel::CM_Strided:
     case LoopVectorizationCostModel::CM_GatherScatter:
       return TTI::CastContextHint::GatherScatter;
     case LoopVectorizationCostModel::CM_Interleave:
@@ -8424,16 +8498,27 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
   // reverse consecutive.
   LoopVectorizationCostModel::InstWidening Decision =
       CM.getWideningDecision(I, Range.Start);
+
+  auto SameWiden = [&](ElementCount VF) -> bool {
+    return Decision == CM.getWideningDecision(I, VF);
+  };
+  bool ContainsWidenVF =
+      LoopVectorizationPlanner::getDecisionAndClampRange(SameWiden, Range);
+  assert(ContainsWidenVF &&
+         "At least widen the memory accesses by the Start VF.");
+
   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
   bool Consecutive =
       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+  bool Strided = Decision == LoopVectorizationCostModel::CM_Strided;
 
   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
-  if (Consecutive) {
+  if (Consecutive || Strided) {
     auto *GEP = dyn_cast<GetElementPtrInst>(
         Ptr->getUnderlyingValue()->stripPointerCasts());
     VPSingleDefRecipe *VectorPtr;
     if (Reverse) {
+      assert(!Strided && "Reverse and Strided are mutually exclusive.");
       // When folding the tail, we may compute an address that we don't in the
       // original scalar loop and it may not be inbounds. Drop Inbounds in that
       // case.
@@ -8444,17 +8529,30 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
       VectorPtr = new VPVectorEndPointerRecipe(
           Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
     } else {
-      VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
+      VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), Strided,
                                             GEP ? GEP->getNoWrapFlags()
                                                 : GEPNoWrapFlags::none(),
                                             I->getDebugLoc());
     }
     Builder.insert(VectorPtr);
     Ptr = VectorPtr;
   }
-  if (LoadInst *Load = dyn_cast<LoadInst>(I))
+  if (LoadInst *Load = dyn_cast<LoadInst>(I)) {
+    if (Strided) {
+      const DataLayout &DL = Load->getDataLayout();
+      auto *StrideTy = DL.getIndexType(Load->getPointerOperand()->getType());
+      int64_t Stride = CM.getStride(Load);
+      assert(Stride == -1 &&
+             "Only stride memory access with a stride of -1 is supported.");
+      VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get(
+          StrideTy, Stride * DL.getTypeAllocSize(getLoadStoreType(Load))));
+      return new VPWidenStridedLoadRecipe(*Load, Ptr, StrideVPV, &Plan.getVF(),
+                                          Mask, VPIRMetadata(*Load, LVer),
+                                          I->getDebugLoc());
+    }
     return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
                                  VPIRMetadata(*Load, LVer), I->getDebugLoc());
+  }
 
   StoreInst *Store = cast<StoreInst>(I);
   return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
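For context, a minimal sketch of the kind of source loop this change targets; the function name, element type, and loop shape below are illustrative, not taken from the patch. A consecutive reverse access (stride -1) like this is what the new CM_Strided decision can map to a strided load with a negative byte stride, rather than a unit-stride wide load followed by a vector reverse, whenever TTI.isLegalStridedLoadStore accepts the vector type and getStridedLoadStoreCost comes out cheaper.

// Illustrative only: each iteration reads a[i] with i decreasing by 1, so the
// accessed addresses are consecutive in reverse. Under CM_Strided the widened
// load can use a byte stride of -1 * sizeof(float), matching the
// Stride * DL.getTypeAllocSize(...) computation in tryToWidenMemory above.
float sum_reverse(const float *a, int n) {
  float s = 0.0f;
  for (int i = n - 1; i >= 0; --i)
    s += a[i];
  return s;
}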