[LV] Support strided load with a stride of -1 #128718
Conversation
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms

Author: Mel Chen (Mel-Chen)

Changes

Patch is 118.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/128718.diff

9 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8c41f896ad622..6635a91563384 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1143,6 +1143,7 @@ class LoopVectorizationCostModel {
CM_Widen_Reverse, // For consecutive accesses with stride -1.
CM_Interleave,
CM_GatherScatter,
+ CM_Strided,
CM_Scalarize,
CM_VectorCall,
CM_IntrinsicCall
@@ -1308,6 +1309,18 @@ class LoopVectorizationCostModel {
(SI && TTI.isLegalMaskedScatter(Ty, Align));
}
+ /// Returns true if the target machine can represent \p V as a strided load
+ /// or store operation.
+ bool isLegalStridedLoadStore(Value *V, ElementCount VF) {
+ if (!isa<LoadInst, StoreInst>(V))
+ return false;
+ auto *Ty = getLoadStoreType(V);
+ Align Align = getLoadStoreAlignment(V);
+ if (VF.isVector())
+ Ty = VectorType::get(Ty, VF);
+ return TTI.isLegalStridedLoadStore(Ty, Align);
+ }
+
/// Returns true if the target machine supports all of the reduction
/// variables found for the given VF.
bool canVectorizeReductions(ElementCount VF) const {
@@ -1645,6 +1658,9 @@ class LoopVectorizationCostModel {
/// element)
InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
+ /// The cost computation for strided load/store instruction.
+ InstructionCost getStridedLoadStoreCost(Instruction *I, ElementCount VF);
+
/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
InstructionCost getScalarizationOverhead(Instruction *I,
@@ -5813,6 +5829,19 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
return Cost;
}
+InstructionCost
+LoopVectorizationCostModel::getStridedLoadStoreCost(Instruction *I,
+ ElementCount VF) {
+ Type *ValTy = getLoadStoreType(I);
+ auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
+ const Align Alignment = getLoadStoreAlignment(I);
+ const Value *Ptr = getLoadStorePointerOperand(I);
+
+ return TTI.getStridedMemoryOpCost(I->getOpcode(), VectorTy, Ptr,
+ Legal->isMaskRequired(I), Alignment,
+ CostKind, I);
+}
+
std::optional<InstructionCost>
LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
ElementCount VF,
@@ -6132,6 +6161,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
"Expected consecutive stride.");
InstWidening Decision =
ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
+ // Consider using strided load/store for consecutive reverse accesses to
+ // achieve more efficient memory operations.
+ if (ConsecutiveStride == -1) {
+ const InstructionCost StridedLoadStoreCost =
+ isLegalStridedLoadStore(&I, VF) ? getStridedLoadStoreCost(&I, VF)
+ : InstructionCost::getInvalid();
+ if (StridedLoadStoreCost < Cost) {
+ Decision = CM_Strided;
+ Cost = StridedLoadStoreCost;
+ }
+ }
setWideningDecision(&I, VF, Decision, Cost);
continue;
}
@@ -6777,6 +6817,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return TTI::CastContextHint::Normal;
switch (getWideningDecision(I, VF)) {
+ // TODO: New CastContextHint for strided accesses.
+ case LoopVectorizationCostModel::CM_Strided:
case LoopVectorizationCostModel::CM_GatherScatter:
return TTI::CastContextHint::GatherScatter;
case LoopVectorizationCostModel::CM_Interleave:
@@ -8335,6 +8377,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
bool Consecutive =
Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+ bool Strided = Decision == LoopVectorizationCostModel::CM_Strided;
VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
if (Consecutive) {
@@ -8362,11 +8405,11 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
}
if (LoadInst *Load = dyn_cast<LoadInst>(I))
return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
- I->getDebugLoc());
+ Strided, I->getDebugLoc());
StoreInst *Store = cast<StoreInst>(I);
return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
- Reverse, I->getDebugLoc());
+ Reverse, Strided, I->getDebugLoc());
}
/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 8089cfd1ce802..34b4c2c8a0be1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2611,6 +2611,9 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
/// Whether the consecutive accessed addresses are in reverse order.
bool Reverse;
+ /// Whether the accessed addresses are evenly spaced apart by a fixed stride.
+ bool Strided;
+
/// Whether the memory access is masked.
bool IsMasked = false;
@@ -2624,9 +2627,9 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
std::initializer_list<VPValue *> Operands,
- bool Consecutive, bool Reverse, DebugLoc DL)
+ bool Consecutive, bool Reverse, bool Strided, DebugLoc DL)
: VPRecipeBase(SC, Operands, DL), Ingredient(I), Consecutive(Consecutive),
- Reverse(Reverse) {
+ Reverse(Reverse), Strided(Strided) {
assert((Consecutive || !Reverse) && "Reverse implies consecutive");
}
@@ -2654,6 +2657,10 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
/// order.
bool isReverse() const { return Reverse; }
+ /// Return whether the accessed addresses are evenly spaced apart by a fixed
+ /// stride.
+ bool isStrided() const { return Strided; }
+
/// Return the address accessed by this recipe.
VPValue *getAddr() const { return getOperand(0); }
@@ -2683,16 +2690,16 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
/// optional mask.
struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
- bool Consecutive, bool Reverse, DebugLoc DL)
+ bool Consecutive, bool Reverse, bool Strided, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
- Reverse, DL),
+ Reverse, Strided, DL),
VPValue(this, &Load) {
setMask(Mask);
}
VPWidenLoadRecipe *clone() override {
return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
- getMask(), Consecutive, Reverse,
+ getMask(), Consecutive, Reverse, Strided,
getDebugLoc());
}
@@ -2724,7 +2731,7 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
{L.getAddr(), &EVL}, L.isConsecutive(),
- L.isReverse(), L.getDebugLoc()),
+ L.isReverse(), L.isStrided(), L.getDebugLoc()),
VPValue(this, &getIngredient()) {
setMask(Mask);
}
@@ -2761,16 +2768,17 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
/// to store to and an optional mask.
struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
- VPValue *Mask, bool Consecutive, bool Reverse, DebugLoc DL)
+ VPValue *Mask, bool Consecutive, bool Reverse,
+ bool Strided, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
- Consecutive, Reverse, DL) {
+ Consecutive, Reverse, Strided, DL) {
setMask(Mask);
}
VPWidenStoreRecipe *clone() override {
return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
getStoredValue(), getMask(), Consecutive,
- Reverse, getDebugLoc());
+ Reverse, Strided, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC);
@@ -2804,7 +2812,8 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue &EVL, VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
{S.getAddr(), S.getStoredValue(), &EVL},
- S.isConsecutive(), S.isReverse(), S.getDebugLoc()) {
+ S.isConsecutive(), S.isReverse(), S.isStrided(),
+ S.getDebugLoc()) {
setMask(Mask);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index d57a6c481748c..e62ae690caa6d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2609,10 +2609,15 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
assert(!Reverse &&
"Inconsecutive memory access should not have the order.");
- return Ctx.TTI.getAddressComputationCost(Ty) +
- Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
- IsMasked, Alignment, Ctx.CostKind,
- &Ingredient);
+ if (Strided)
+ return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, Ptr,
+ IsMasked, Alignment, Ctx.CostKind,
+ &Ingredient);
+ else
+ return Ctx.TTI.getAddressComputationCost(Ty) +
+ Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
+ IsMasked, Alignment, Ctx.CostKind,
+ &Ingredient);
}
InstructionCost Cost = 0;
@@ -2639,11 +2644,13 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
const Align Alignment = getLoadStoreAlignment(&Ingredient);
- bool CreateGather = !isConsecutive();
+ bool CreateGather = !isConsecutive() && !isStrided();
auto &Builder = State.Builder;
State.setDebugLocFrom(getDebugLoc());
- Value *Mask = nullptr;
+ Value *Mask = isStrided()
+ ? Builder.CreateVectorSplat(State.VF, Builder.getTrue())
+ : nullptr;
if (auto *VPMask = getMask()) {
// Mask reversal is only needed for non-all-one (null) masks, as reverse
// of a null all-one mask is a null mask.
@@ -2658,9 +2665,25 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
"wide.masked.gather");
} else if (Mask) {
- NewLI =
- Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
- PoisonValue::get(DataTy), "wide.masked.load");
+ if (isStrided()) {
+ const DataLayout &DL = LI->getDataLayout();
+ auto *PtrTy = Addr->getType();
+ auto *StrideTy = DL.getIndexType(PtrTy);
+ // TODO: Support non-unit-reverse strided accesses.
+ auto *StrideVal =
+ ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(ScalarDataTy));
+ Value *RuntimeVF =
+ getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
+ NewLI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
+ {Addr, StrideVal, Mask, RuntimeVF}, nullptr, "wide.strided.load");
+ cast<CallInst>(NewLI)->addParamAttr(
+ 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
+ } else {
+ NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
+ PoisonValue::get(DataTy),
+ "wide.masked.load");
+ }
} else {
NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
}
@@ -2698,7 +2721,7 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
const Align Alignment = getLoadStoreAlignment(&Ingredient);
- bool CreateGather = !isConsecutive();
+ bool CreateGather = !isConsecutive() && !isStrided();
auto &Builder = State.Builder;
State.setDebugLocFrom(getDebugLoc());
@@ -2718,6 +2741,16 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
NewLI =
Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
nullptr, "wide.masked.gather");
+ } else if (isStrided()) {
+ const DataLayout &DL = LI->getDataLayout();
+ auto *PtrTy = Addr->getType();
+ auto *StrideTy = DL.getIndexType(PtrTy);
+ // TODO: Support non-unit-reverse strided accesses.
+ auto *StrideVal =
+ ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(ScalarDataTy));
+ NewLI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
+ {Addr, StrideVal, Mask, EVL}, nullptr, "wide.strided.load");
} else {
VectorBuilder VBuilder(Builder);
VBuilder.setEVL(EVL).setMask(Mask);
@@ -2772,13 +2805,15 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
auto *SI = cast<StoreInst>(&Ingredient);
VPValue *StoredVPValue = getStoredValue();
- bool CreateScatter = !isConsecutive();
+ bool CreateScatter = !isConsecutive() && !isStrided();
const Align Alignment = getLoadStoreAlignment(&Ingredient);
auto &Builder = State.Builder;
State.setDebugLocFrom(getDebugLoc());
- Value *Mask = nullptr;
+ Value *Mask = isStrided()
+ ? Builder.CreateVectorSplat(State.VF, Builder.getTrue())
+ : nullptr;
if (auto *VPMask = getMask()) {
// Mask reversal is only needed for non-all-one (null) masks, as reverse
// of a null all-one mask is a null mask.
@@ -2797,12 +2832,32 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
}
Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
Instruction *NewSI = nullptr;
- if (CreateScatter)
+ if (CreateScatter) {
NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
- else if (Mask)
- NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
- else
+ } else if (Mask) {
+ if (isStrided()) {
+ const DataLayout &DL = SI->getDataLayout();
+ auto *StoredVecTy = cast<VectorType>(StoredVal->getType());
+ Type *StoredEltTy = StoredVecTy->getElementType();
+ auto *PtrTy = Addr->getType();
+ auto *StrideTy = DL.getIndexType(PtrTy);
+ // TODO: Support non-unit-reverse strided accesses.
+ auto *StrideVal =
+ ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(StoredEltTy));
+ Value *RuntimeVF =
+ getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
+ NewSI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vp_strided_store,
+ {StoredVecTy, PtrTy, StrideTy},
+ {StoredVal, Addr, StrideVal, Mask, RuntimeVF});
+ cast<CallInst>(NewSI)->addParamAttr(
+ 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
+ } else {
+ NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
+ }
+ } else {
NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
+ }
State.addMetadata(NewSI, SI);
}
@@ -2818,7 +2873,7 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
auto *SI = cast<StoreInst>(&Ingredient);
VPValue *StoredValue = getStoredValue();
- bool CreateScatter = !isConsecutive();
+ bool CreateScatter = !isConsecutive() && !isStrided();
const Align Alignment = getLoadStoreAlignment(&Ingredient);
auto &Builder = State.Builder;
@@ -2843,11 +2898,25 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
Intrinsic::vp_scatter,
{StoredVal, Addr, Mask, EVL});
} else {
- VectorBuilder VBuilder(Builder);
- VBuilder.setEVL(EVL).setMask(Mask);
- NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
- Instruction::Store, Type::getVoidTy(EVL->getContext()),
- {StoredVal, Addr}));
+ if (isStrided()) {
+ const DataLayout &DL = SI->getDataLayout();
+ auto *StoredVecTy = cast<VectorType>(StoredVal->getType());
+ Type *StoredEltTy = StoredVecTy->getElementType();
+ auto *PtrTy = Addr->getType();
+ auto *StrideTy = DL.getIndexType(PtrTy);
+ // TODO: Support non-unit-reverse strided accesses.
+ auto *StrideVal =
+ ConstantInt::get(StrideTy, -1 * DL.getTypeAllocSize(StoredEltTy));
+ NewSI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store,
+ {StoredVecTy, PtrTy, StrideTy},
+ {StoredVal, Addr, StrideVal, Mask, EVL});
+ } else {
+ VectorBuilder VBuilder(Builder);
+ VBuilder.setEVL(EVL).setMask(Mask);
+ NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
+ Instruction::Store, Type::getVoidTy(EVL->getContext()),
+ {StoredVal, Addr}));
+ }
}
NewSI->addParamAttr(
1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6c917e4eef655..fd6984208a760 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -73,13 +73,13 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
NewRecipe = new VPWidenLoadRecipe(
*Load, Ingredient.getOperand(0), nullptr /*Mask*/,
- false /*Consecutive*/, false /*Reverse*/,
+ false /*Consecutive*/, false /*Reverse*/, false /*Strided*/,
Ingredient.getDebugLoc());
} else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
NewRecipe = new VPWidenStoreRecipe(
*Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
- Ingredient.getDebugLoc());
+ false /*Strided*/, Ingredient.getDebugLoc());
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
} else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
new file mode 100644
index 0000000000000..1aa01d7aabee9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -0,0 +1,642 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+
+;; This is the loop in c++ being vectorize in this file with
+;; vector.reverse
+;; #pragma clang loop vectorize_width(4, scalable)
+;; for (int i = N-1; i >= 0; --i)
+;; a[i] = b[i] ...
[truncated]
@@ -1308,6 +1309,18 @@ class LoopVectorizationCostModel {
           (SI && TTI.isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if the target machine can represent \p V as a strided load
  /// or store operation.
  bool isLegalStridedLoadStore(Value *V, ElementCount VF) {
I think it would be better to have the format of the function similar to isLegalGatherOrScatter, isLegalMaskedLoad, etc.
I copied the function name from the TTI function. Do you have any recommended names?
It is not about the name, but more about the function parameters.
The prototype of isLegalGatherOrScatter is:
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
And the prototype of isLegalMaskedLoad is:
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment)
isLegalStridedLoadStore is the same as isLegalGatherOrScatter now.
Do you prefer the prototype of isLegalMaskedLoad?
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed from 1966fd5 to c54369e.
This patch is ready for review. Please take a look, thanks.
@@ -1312,6 +1313,22 @@ class LoopVectorizationCostModel {
           (SI && TTI.isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if the target machine can represent \p V as a strided load
  /// or store operation.
  bool isLegalStridedLoadStore(Value *V, ElementCount VF) const {
I'm suggesting to use the prototype of isLegalMaskedLoad here; most of these functions do their analysis based on type/alignment info.
After consideration, I have decided to rename isLegalStridedLoadStore to stridedAccessCanBeWidened and align its prototype with memoryInstructionCanBeWidened and interleavedAccessCanBeWidened.
The reason is that, unlike isLegalGatherOrScatter and isLegalMaskedLoad, which primarily check whether the target supports certain operations, strided memory access requires additional checks, such as checking the stride. Its functionality is likely closer to memoryInstructionCanBeWidened and interleavedAccessCanBeWidened.
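For reference, a minimal sketch of what such a query could look like, assuming it mirrors memoryInstructionCanBeWidened and reuses the legality checks already present in this patch plus a stride check through Legal->isConsecutivePtr; this is illustrative, not the actual implementation:

// Hypothetical sketch only: a widening-style query shaped like
// memoryInstructionCanBeWidened. The -1 stride restriction matches what this
// initial patch supports.
bool stridedAccessCanBeWidened(Instruction *I, ElementCount VF) const {
  // Only loads and stores can be widened into strided accesses.
  if (!isa<LoadInst, StoreInst>(I))
    return false;
  // This initial patch only handles a constant stride of -1 (reverse access).
  if (Legal->isConsecutivePtr(getLoadStoreType(I),
                              getLoadStorePointerOperand(I)) != -1)
    return false;
  // The target must support a strided load/store of the widened type.
  Type *Ty = getLoadStoreType(I);
  if (VF.isVector())
    Ty = VectorType::get(Ty, VF);
  return TTI.isLegalStridedLoadStore(Ty, getLoadStoreAlignment(I));
}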
    // TODO: Support non-unit-reverse strided accesses.
    Value *Index =
        Strided
            ? Builder.CreateMul(Increment, ConstantInt::getSigned(IndexTy, -1))
Strided means negative stride? Would be good to clarify that in the comment for Strided.
Just for now. Later it is supposed to support positive (and <-1 negative) and runtime stride values.
Should Strided eventually be some sort of std::optional<int>? Or is the plan to also allow runtime strided values? In which case I presume we need to model it as a VPValue operand.
This is the initial patch for strided memory accesses, currently only supporting cases where the stride is -1. A follow-up patch will focus on stride analysis to support more strided memory accesses.
However, I believe it would be beneficial to discuss and decide how VPlan should handle stride in this initial patch. My current plan is to extend the existing VPWidenLoadRecipe/VPWidenStoreRecipe and VPVectorPointerRecipe by explicitly passing the stride into the recipe. For example:
VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
                  VPValue *Stride, bool Consecutive, bool Reverse, DebugLoc DL)
A memory access will only be considered strided when Stride is explicitly provided (i.e., not nullptr).
I don't think we should add another 'mode' to WidenLoad/StoreRecipe to make it even more complicated.
It would probably be good to have a separate recipe.
If we want to support it for EVL tail folding further down the line would we also need to add an EVL + Strided load recipe?
I don't think we should add another 'mode' to WidenLoad/StoreRecipe to make it even more complicated.
It would probably be good to have a separate recipe.
@fhahn I see. Let's first try adding VPStridedLoadRecipe for strided loads. What about VPVectorPointerRecipe? Can we modify it to:
VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, VPValue *Stride,
                      GEPNoWrapFlags GEPFlags, DebugLoc DL)
Or should we introduce a new recipe instead?
If we want to support it for EVL tail folding further down the line would we also need to add an EVL + Strided load recipe?
@lukel97 I think we can pass VF into the new recipe, and replace VF with EVL, like VPVectorEndPointerRecipe did.
@Mel-Chen yes, that's a good idea, since it will generate a vp intrinsic either way.
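A rough sketch of what such a dedicated recipe's constructor might look like, assuming the stride and the runtime VF become explicit VPValue operands (matching the runtimeVF operand visible in the recipe print-outs later in this thread) so that an EVL transform can substitute EVL for VF; the class name and operand order are illustrative, not the final definition:

// Hypothetical sketch: a dedicated strided-load recipe. Addr, Stride and VF
// are VPValue operands; keeping VF as an operand lets the EVL lowering later
// replace it with the explicit vector length, as VPVectorEndPointerRecipe does.
VPWidenStridedLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Stride,
                         VPValue *VF, VPValue *Mask, DebugLoc DL);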
…pe. NFC After llvm#128718 lands there will be two ways of performing a reversed widened memory access, either by performing a consecutive unit-stride access and a reverse, or a strided access with a negative stride. Even though both produce a reversed vector, only the former needs VPReverseVectorPointerRecipe which computes a pointer to the last element of the widened vector. A strided reverse still needs a pointer to the first element of each part so it will use VPVectorPointerRecipe. This renames VPReverseVectorPointerRecipe to VPVectorEndPointerRecipe to clarify that a reversed access may not necessarily need a pointer to the last element.
Duplicate riscv-vector-reverse.ll as riscv-vector-reverse-output.ll to verify all generated IR, not just debug output. Pre-commit for #128718.
…pe. NFC (#131086) After #128718 lands there will be two ways of performing a reversed widened memory access, either by performing a consecutive unit-stride access and a reverse, or a strided access with a negative stride. Even though both produce a reversed vector, only the former needs VPReverseVectorPointerRecipe which computes a pointer to the last element of each part. A strided reverse still needs a pointer to the first element of each part so it will use VPVectorPointerRecipe. This renames VPReverseVectorPointerRecipe to VPVectorEndPointerRecipe to clarify that a reversed access may not necessarily need a pointer to the last element.
Force-pushed from 8fe1f5e to a62c8f5.
Changes:
The new recipe LGTM
      auto SameWiden = [&](ElementCount VF) -> bool {
        return Decision == CM.getWideningDecision(I, VF);
      };
      bool ContainsWidenVF =
          LoopVectorizationPlanner::getDecisionAndClampRange(SameWiden, Range);
      assert(ContainsWidenVF &&
             "At least widen the memory accesses by the Start VF.");
Just to check my understanding, with this patch do we now have different VFs with different widening decisions that now need to be pruned? E.g. CM_Strided for VF=1, CM_WidenReverse for VF=2? Was this an issue beforehand?
Also is this assert still needed? I think this is only checking CM.getWideningDecision(I, Range.Start) == CM.getWideningDecision(I, Range.Start)
Just to check my understanding, with this patch do we now have different VFs with different widening decisions that now need to be pruned? E.g. CM_Strided for VF=1, CM_WidenReverse for VF=2? Was this an issue beforehand?
Yes, currently, when the VF is smaller, the cost model prefers CM_Strided, while for larger VFs, it favors CM_WidenReverse. This leads to a mismatch between the legacy cost model and the VPlan-based cost model, which is a situation we haven't encountered before.
Also is this assert still needed? I think this is only checking
CM.getWideningDecision(I, Range.Start) == CM.getWideningDecision(I, Range.Start)
I'm also torn about whether we should keep this assertion. ContainsWidenVF is theoretically supposed to be true; if it's not, we've definitely broken something. :(
; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]>
I am a bit surprised why this strided load with stride of 1 is more profitable than a consecutive load?
This is a comparison between strided load with stride -1 and reverse load. It could be due to the reverse operation being relatively expensive.
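To make the trade-off concrete, here is a rough sketch of the two costs being weighed, assuming the usual TTI cost hooks (exact signatures approximated, not code from the patch): the reverse-widening path pays for a wide load plus an explicit reverse shuffle, while the strided path is a single strided memory operation.

// Illustrative only; VectorTy, Alignment, AS, Ptr and CostKind stand in for
// values the cost model already has at hand.
InstructionCost ReverseCost =
    TTI.getMemoryOpCost(Instruction::Load, VectorTy, Alignment, AS, CostKind) +
    TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {}, CostKind);
InstructionCost StridedCost = TTI.getStridedMemoryOpCost(
    Instruction::Load, VectorTy, Ptr, /*VariableMask=*/false, Alignment,
    CostKind);
// CM_Strided is chosen only when StridedCost is the cheaper of the two.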
/// A recipe for strided load operations, using the base address, stride, and an
/// optional mask.
Conceptually the strided load with stride of 1 seems like it would be the same as a wide load? Would be good to clarify the difference.
Yes, you are right. Strided load can represent consecutive access, just like gather can also represent consecutive access. It's just that we usually don't choose the one with worse performance.
Updated the comment briefly. a240003
@@ -1115,6 +1115,7 @@ class LoopVectorizationCostModel {
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Strided,
Do we actually need all the legacy cost model changes? Could we just replace a VPWidenLoadRecipe (vector-end-pointer) -> VPStridedLoad if profitable as a VPlan-to-VPlan transformation?
Possible in its current state, but not sure it is possible in general. Some (potential) strided memory recipes are currently represented as masked gathers, which require widened pointers. Strided/interleaved mem accesses do not require widened (vector) pointers, so it may affect the cost significantly. I think we need to teach the whole loop vectorizer the strided mem accesses pattern (no matter if the stride is constant, as it is right now, or runtime defined).
Besides the points @alexey-bataev mentioned, strided access might also need to be compared in terms of cost with interleaved access and gather/scatter access. If we encounter a situation where gather/scatter is converted to strided access, that may be workable, but converting interleaved groups to strided access might not be a simple task. Given all of this, I believe continuing with CM_Strided is a more scalable choice.
Do we have test cases showing the scenarios we would not be able to handle?
One thing to consider w.r.t. adding new logic in the legacy cost model path is that it will lead to more/duplicated work during the migration away from making decisions there.
I believe the main challenge is interleaved access and strided access.
The interleaved accesses: https://godbolt.org/z/eTcjvxYGK
In the same code, gather/scatter vector pointers with a stride of 2 can be transformed into strided loads/stores in the future: https://godbolt.org/z/hfhx953a8
If we realize during VPlan transforms that strided access is more profitable than interleaved access, converting interleaved accesses into strided ones would require splitting them back into multiple strided accesses.
Moreover, if an interleaved access originally requires a scalar epilogue due to a tail gap,
void foo (int *A, int *B, int N) {
int i;
for (i = 0; i < N; ++i) {
A[i*3] = B[i*3] + 1;
A[i*3 + 1] = B[i*3 + 1] + 1;
// No access [i*3+2], this is gap in the end of interleave group, requires scalar epilogue.
}
}
such a requirement will disappear once it's converted into strided accesses. In that case, we would also need to modify the VPlan loop skeleton during the VPlan transform.
Given all that, the most we can do for now is something similar to what VPTransform::createInterleaveGroup currently does: decide whether to generate strided accesses before the VPlan is built, using an enum InstWidening to collect the necessary information, and then create a new VPTransform::createStridedAccesses to generate VPWidenStridedLoad/Store recipes.
FWIW I've been experimenting locally with emitting strides as a VPlan transform. It's not a lot of code and it ends up more or less moving the stride detection from RISCVGatherScatterLowering.cpp into VPlan. It becomes easier though because instead of traversing through phis, you only really need to look for VPWidenGEPRecipes with a VPWidenIntOrFpInductionRecipe index (or VPScalarIVSteps). It looks something like this:
static void convertToStridedRecipes(VPlan &Plan) {
LLVMContext &Ctx = Plan.getCanonicalIV()->getScalarType()->getContext();
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
VPBuilder Builder(&R);
if (auto *Store = dyn_cast<VPWidenStoreRecipe>(&R)) {
if (Constant *StrideC = getStride(Store)) {
VPValue *Stride = Plan.getOrAddLiveIn(StrideC);
VPValue *Mask = Store->getMask();
if (!Mask)
Mask = Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx));
VPValue *EVL =
Builder.createScalarCast(Instruction::Trunc, &Plan.getVF(),
Type::getInt32Ty(Ctx), R.getDebugLoc());
auto *StridedStore = new VPWidenIntrinsicRecipe(
Intrinsic::experimental_vp_strided_store,
{Store->getStoredValue(), Store->getAddr(), Stride, Mask, EVL},
Type::getVoidTy(Ctx));
StridedStore->insertBefore(&R);
R.eraseFromParent();
}
}
if (auto *Load = dyn_cast<VPWidenLoadRecipe>(&R)) {
if (Constant *StrideC = getStride(Load)) {
VPValue *Stride = Plan.getOrAddLiveIn(StrideC);
VPValue *Mask = Load->getMask();
if (!Mask)
Mask = Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx));
VPValue *EVL =
Builder.createScalarCast(Instruction::Trunc, &Plan.getVF(),
Type::getInt32Ty(Ctx), R.getDebugLoc());
auto *StridedLoad = new VPWidenIntrinsicRecipe(
Intrinsic::experimental_vp_strided_load,
{Load->getAddr(), Stride, Mask, EVL},
Load->getIngredient().getType());
StridedLoad->insertBefore(&R);
R.getVPSingleValue()->replaceAllUsesWith(StridedLoad);
R.eraseFromParent();
}
}
}
}
It doesn't necessarily need to use VPWidenIntrinsicRecipe; the strided recipe added in this PR would also work.
Thanks everyone for the feedback and suggestions!
However, I ran into an issue at the very first step of trying to remove CM_Strided :(
All reverse accesses with EVL tail folding are no longer vectorized after removing the collectLoopUniforms update:
@@ -3412,9 +3412,9 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF)
if (IsUniformMemOpUse(I))
return true;
- return (
- WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse ||
- WideningDecision == CM_Strided || WideningDecision == CM_Interleave);
+ return (WideningDecision == CM_Widen ||
+ WideningDecision == CM_Widen_Reverse ||
+ WideningDecision == CM_Interleave);
};
// Returns true if Ptr is the pointer operand of a memory access instruction
The root cause seems to be that the GEP is now considered as needing to be widened, which blocks the derived induction variable.
- The VPlan before we remove the update in collectLoopUniforms:
<x1> vector loop: {
vector.body:
EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
ir<%add.phi> = WIDEN-INDUCTION ir<%startval>, ir<-1>, vp<%0>
ir<%i> = WIDEN-INDUCTION ir<0>, ir<1>, vp<%0>
EMIT vp<%7> = WIDEN-CANONICAL-INDUCTION vp<%6>
EMIT vp<%8> = icmp ule vp<%7>, vp<%3>
CLONE ir<%add> = add ir<%add.phi>, ir<-1>
CLONE ir<%gepl> = getelementptr inbounds ir<%ptr>, ir<%add>
vp<%9> = vector-pointer ir<%gepl>
WIDEN ir<%tmp> = load vp<%9>, stride = ir<-4>, runtimeVF = vp<%0>
- The VPlan after we remove the update in collectLoopUniforms:
<x1> vector loop: {
vector.body:
EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
ir<%add.phi> = WIDEN-INDUCTION ir<%startval>, ir<-1>, vp<%0>
ir<%i> = WIDEN-INDUCTION ir<0>, ir<1>, vp<%0>
EMIT vp<%7> = WIDEN-CANONICAL-INDUCTION vp<%6>
EMIT vp<%8> = icmp ule vp<%7>, vp<%3>
WIDEN ir<%add> = add ir<%add.phi>, ir<-1>
WIDEN-GEP Inv[Var] ir<%gepl> = getelementptr inbounds ir<%ptr>, ir<%add>
vp<%9> = vector-pointer ir<%gepl>
WIDEN ir<%tmp> = load vp<%9>, stride = ir<-4>, runtimeVF = vp<%0>
This leads to the EVL VPlan bailing out due to the failure in converting the widened induction variable.
@fhahn Do we currently have a VPlan-based scalarization transformation? If not, do we need to install one before running legalizeAndOptimizeInductions?
@fhahn Do we currently have a VPlan-based scalarization transformation? If not, do we need to install one before running legalizeAndOptimizeInductions?
Hmm... answered it by myself. Looks like it just needs some fix in legalizeAndOptimizeInductions. I will try to fix it tomorrow :)
// Try to narrow wide and replicating recipes to uniform recipes, based on
// VPlan analysis.
// TODO: Apply to all recipes in the future, to replace legacy uniformity
// analysis.
I recently put up #139150 to start in this direction. I have a few follow-up patches that should get us close.
I recently put up #139150 to start in this direction. I have a few follow-up patches that should get us close.
This is a good direction, but #139150 doesn't yet address the issue we're facing.
So for now, I've made a small change in legalizeAndOptimizeInductions: a55afd4
While it's still not very effective at converting widened induction variables into derived ones in more complex use-chain scenarios, like:
; CHECK-NEXT: ir<[[WIDEN_IV:%.+]]> = WIDEN-INDUCTION ir<%n>, ir<-1>, vp<[[VF]]>
; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1>
; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64
; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]>
it at least allows us to continue to try this path and ensures that the lit test cases for EVL tail folding remain vectorizable.
Force-pushed from 52dc1ca to a240003.
Also cherry-picked the branch Mel-Chen:legalizeAndOptimizeInductions. However, it still does not work as well as collectLoopUniforms if the use-chain is too complicated. :(
Still relies on CM_Strided to determine legality and cost.
I’ve removed CM_Strided and moved the generation of strided access recipes entirely into the VPlan transformation phase. Please take a look, thanks. Here are some potential issues that are currently masked because we only convert reverse accesses at the moment, but they may need to be addressed in the future:
Initial patch to support strided memory accesses. Currently, only stride -1 is supported, with future extensions planned for additional strides and strided store.
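For context, the shape of loop this targets is the reverse loop from the RISC-V test added above; a self-contained variant is shown below (the loop body is illustrative, since the test's body is truncated in the quoted diff). With this patch, the widened loads from b can be emitted as llvm.experimental.vp.strided.load with a byte stride of -4 instead of a consecutive load followed by a vector reverse.

// Reverse loop: successive iterations touch index i, i-1, ..., so a widened
// access of b becomes a strided access with element stride -1 (-4 bytes for
// a 32-bit element).
void reverse_add(int *a, int *b, int N) {
#pragma clang loop vectorize_width(4, scalable)
  for (int i = N - 1; i >= 0; --i)
    a[i] = b[i] + 1;
}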