[LV] Support strided load with a stride of -1 #128718

Open
wants to merge 7 commits into base: main
23 changes: 14 additions & 9 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4187,7 +4187,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
[](const auto *R) { return Instruction::Select; })
.Case<VPWidenStoreRecipe>(
[](const auto *R) { return Instruction::Store; })
.Case<VPWidenLoadRecipe>(
.Case<VPWidenLoadRecipe, VPWidenStridedLoadRecipe>(
[](const auto *R) { return Instruction::Load; })
.Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
[](const auto *R) { return Instruction::Call; })
@@ -4286,6 +4286,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPWidenPointerInductionSC:
case VPDef::VPReductionPHISC:
case VPDef::VPInterleaveSC:
case VPDef::VPWidenStridedLoadSC:
case VPDef::VPWidenLoadEVLSC:
case VPDef::VPWidenLoadSC:
case VPDef::VPWidenStoreEVLSC:
@@ -8253,10 +8254,10 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
VectorPtr = new VPVectorEndPointerRecipe(
Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
} else {
VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
GEP ? GEP->getNoWrapFlags()
: GEPNoWrapFlags::none(),
I->getDebugLoc());
VectorPtr = new VPVectorPointerRecipe(
Ptr, getLoadStoreType(I), /*Strided*/ false,
GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(),
I->getDebugLoc());
}
Builder.insert(VectorPtr);
Ptr = VectorPtr;
@@ -9405,16 +9406,20 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
// Adjust the recipes for any inloop reductions.
adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);

VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
CM.CostKind);
// Transform recipes to abstract recipes if it is legal and beneficial and
// clamp the range for better cost estimation.
// TODO: Enable following transform when the EVL-version of extended-reduction
// and mulacc-reduction are implemented.
if (!CM.foldTailWithEVL()) {
VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
CM.CostKind);
if (!CM.foldTailWithEVL())
VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
CostCtx, Range);
}

// Convert reverse memory recipes to strided access recipes if the strided
// access is legal and profitable.
VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan,
CostCtx, Range);

for (ElementCount VF : Range)
Plan->addVF(VF);
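For illustration, a hypothetical input loop (not taken from this PR or its tests) of the kind the convertToStridedAccesses step above is meant to catch: the load walks backwards through src, i.e. with a stride of -1 elements, which the vectorizer currently treats as a consecutive-reverse access.

void reverse_copy(int *dst, const int *src, int n) {
  // The load of src[n - 1 - i] advances by -1 elements per iteration; with
  // this patch the vectorizer can lower it to a strided load with a negative
  // stride (e.g. llvm.experimental.vp.strided.load) instead of a wide load
  // followed by a reverse shuffle, when that is legal and profitable.
  for (int i = 0; i < n; ++i)
    dst[i] = src[n - 1 - i];
}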
65 changes: 60 additions & 5 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -544,6 +544,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPBranchOnMaskSC:
case VPRecipeBase::VPInterleaveSC:
case VPRecipeBase::VPIRInstructionSC:
case VPRecipeBase::VPWidenStridedLoadSC:
case VPRecipeBase::VPWidenLoadEVLSC:
case VPRecipeBase::VPWidenLoadSC:
case VPRecipeBase::VPWidenStoreEVLSC:
@@ -1717,6 +1718,8 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,

VP_CLASSOF_IMPL(VPDef::VPVectorEndPointerSC)

VPValue *getPtr() const { return getOperand(0); }

VPValue *getVFValue() { return getOperand(1); }
const VPValue *getVFValue() const { return getOperand(1); }

@@ -1756,16 +1759,21 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
};

/// A recipe to compute the pointers for widened memory accesses of IndexTy.
/// Supports both consecutive and reverse consecutive accesses.
/// TODO: Support non-unit strided accesses.
class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
public VPUnrollPartAccessor<1> {
Type *IndexedTy;

/// Indicate whether to compute the pointer for strided memory accesses.
bool Strided;

public:
VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, GEPNoWrapFlags GEPFlags,
DebugLoc DL)
VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool Strided,
GEPNoWrapFlags GEPFlags, DebugLoc DL)
: VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
GEPFlags, DL),
IndexedTy(IndexedTy) {}
IndexedTy(IndexedTy), Strided(Strided) {}

VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)

@@ -1786,7 +1794,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
}

VPVectorPointerRecipe *clone() override {
return new VPVectorPointerRecipe(getOperand(0), IndexedTy,
return new VPVectorPointerRecipe(getOperand(0), IndexedTy, Strided,
getGEPNoWrapFlags(), getDebugLoc());
}

@@ -3003,7 +3011,8 @@ class VPWidenMemoryRecipe : public VPRecipeBase, public VPIRMetadata {
return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC ||
R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC;
R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC ||
R->getVPDefID() == VPRecipeBase::VPWidenStridedLoadSC;
}

static inline bool classof(const VPUser *U) {
@@ -3122,6 +3131,52 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
}
};

/// A recipe for strided load operations, using the base address, stride, and an
/// optional mask. This recipe will generate a vp.strided.load intrinsic call
/// to represent memory accesses with a fixed stride.
struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe,
public VPValue {
VPWidenStridedLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Stride,
VPValue *VF, VPValue *Mask,
const VPIRMetadata &Metadata, DebugLoc DL)
: VPWidenMemoryRecipe(
VPDef::VPWidenStridedLoadSC, Load, {Addr, Stride, VF},
/*Consecutive=*/false, /*Reverse=*/false, Metadata, DL),
VPValue(this, &Load) {
setMask(Mask);
}

VPWidenStridedLoadRecipe *clone() override {
return new VPWidenStridedLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
getStride(), getVF(), getMask(), *this,
getDebugLoc());
}

VP_CLASSOF_IMPL(VPDef::VPWidenStridedLoadSC);

/// Return the stride operand.
VPValue *getStride() const { return getOperand(1); }

/// Return the VF operand.
VPValue *getVF() const { return getOperand(2); }

/// Generate a strided load.
void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif

/// Returns true if the recipe only uses the first lane of operand \p Op.
bool onlyFirstLaneUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return Op == getAddr() || Op == getStride() || Op == getVF();
}
};

/// A recipe for widening store operations, using the stored value, the address
/// to store to and an optional mask.
struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
6 changes: 4 additions & 2 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -180,8 +180,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
}

Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
assert((isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(R)) &&
"Store recipes should not define any values");
assert(
(isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe>(
R)) &&
"Store recipes should not define any values");
return cast<LoadInst>(&R->getIngredient())->getType();
}

67 changes: 62 additions & 5 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -80,6 +80,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPWidenCastSC:
case VPWidenGEPSC:
case VPWidenIntOrFpInductionSC:
case VPWidenStridedLoadSC:
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
case VPWidenPHISC:
@@ -101,6 +102,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
switch (getVPDefID()) {
case VPInstructionSC:
return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
case VPWidenStridedLoadSC:
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
return true;
@@ -182,6 +184,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
}
case VPInterleaveSC:
return mayWriteToMemory();
case VPWidenStridedLoadSC:
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
case VPWidenStoreEVLSC:
@@ -2313,7 +2316,7 @@ void VPVectorEndPointerRecipe::execute(VPTransformState &State) {
ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF);
// LastLane = 1 - RunTimeVF
Value *LastLane = Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
Value *Ptr = State.get(getOperand(0), VPLane(0));
Value *Ptr = State.get(getPtr(), VPLane(0));
Value *ResultPtr =
Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags());
ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "",
@@ -2341,8 +2344,13 @@ void VPVectorPointerRecipe::execute(VPTransformState &State) {
Value *Ptr = State.get(getOperand(0), VPLane(0));

Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
// TODO: Support non-unit-reverse strided accesses.
Value *Index =
Strided
? Builder.CreateMul(Increment, ConstantInt::getSigned(IndexTy, -1))
Review thread:

Contributor: Strided means negative stride? Would be good to clarify in the comment for Strided.

Member: Just for now. Later it is supposed to support positive (and <-1 negative) and runtime stride values.

Contributor: Should Strided eventually be some sort of std::optional<int>? Or is the plan to also allow runtime stride values? In which case I presume we need to model it as a VPValue operand.

Contributor Author: This is the initial patch for strided memory accesses, currently only supporting cases where the stride is -1. A follow-up patch will focus on stride analysis to support more strided memory accesses.

However, I believe it would be beneficial to discuss and decide how VPlan should handle stride in this initial patch. My current plan is to extend the existing VPWidenLoadRecipe/VPWidenStoreRecipe and VPVectorPointerRecipe by explicitly passing the stride into the recipe. For example:

VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
                  VPValue *Stride,
                  bool Consecutive, bool Reverse, DebugLoc DL)

A memory access will only be considered strided when Stride is explicitly provided (i.e., not nullptr).

@fhahn @ayalz What do you think about this approach?

Contributor Author: Ping @fhahn @ayalz. Do you think this is a good approach? Or any suggestions?

Contributor: I don't think we should add another 'mode' to WidenLoad/StoreRecipe to make it even more complicated. It would probably be good to have a separate recipe.

Contributor: If we want to support it for EVL tail folding further down the line, would we also need to add an EVL + strided load recipe?

Contributor Author: @fhahn I see. Let's first try adding VPStridedLoadRecipe for strided loads. What about VPVectorPointerRecipe? Can we modify it to:

VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, VPValue *Stride,
                      GEPNoWrapFlags GEPFlags, DebugLoc DL)

Or should we introduce a new recipe instead?

Contributor Author: @lukel97 I think we can pass VF into the new recipe and replace VF with EVL, like VPVectorEndPointerRecipe does.

Contributor: @Mel-Chen yes, that's a good idea, since it will generate a vp intrinsic either way.
: Increment;
Value *ResultPtr =
Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags());
Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags());

State.set(this, ResultPtr, /*IsScalar*/ true);
}
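To make the separate-recipe direction from the review thread above concrete, a minimal sketch (not code from this PR) of how a VPlan-to-VPlan transform could replace a reverse widened load with the new recipe. NewAddr and StrideInBytes are assumed to be VPValues the transform has already materialized; execute() forwards the stride operand straight to the intrinsic, which takes its stride in bytes.

static void replaceWithStridedLoad(VPlan &Plan, VPWidenLoadRecipe *LoadR,
                                   VPValue *NewAddr, VPValue *StrideInBytes) {
  // Sketch only: NewAddr is assumed to point at the lane-0 element and
  // StrideInBytes to hold the (negative) element size; neither helper exists
  // in this patch.
  auto &Load = cast<LoadInst>(LoadR->getIngredient());
  auto *StridedLoad = new VPWidenStridedLoadRecipe(
      Load, NewAddr, StrideInBytes, &Plan.getVF(), LoadR->getMask(),
      /*Metadata=*/*LoadR, LoadR->getDebugLoc());
  StridedLoad->insertBefore(LoadR);
  LoadR->replaceAllUsesWith(StridedLoad);
  LoadR->eraseFromParent();
}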
@@ -2915,9 +2923,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
->getAddressSpace();
unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
? Instruction::Load
: Instruction::Store;
unsigned Opcode =
isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe>(
this)
? Instruction::Load
: Instruction::Store;

if (!Consecutive) {
// TODO: Using the original IR may not be accurate.
@@ -2926,6 +2936,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
assert(!Reverse &&
"Inconsecutive memory access should not have the order.");

if (isa<VPWidenStridedLoadRecipe>(this))
return Ctx.TTI.getStridedMemoryOpCost(
Opcode, Ty, Ptr, IsMasked, Alignment, Ctx.CostKind, &Ingredient);

return Ctx.TTI.getAddressComputationCost(Ty) +
Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment,
Ctx.CostKind, &Ingredient);
@@ -3078,6 +3093,48 @@ void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif

void VPWidenStridedLoadRecipe::execute(VPTransformState &State) {
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
const Align Alignment = getLoadStoreAlignment(&Ingredient);

auto &Builder = State.Builder;
State.setDebugLocFrom(getDebugLoc());
Value *Addr = State.get(getAddr(), /*IsScalar*/ true);
Value *Stride = State.get(getStride(), /*IsScalar*/ true);
Value *Mask = nullptr;
if (VPValue *VPMask = getMask())
Mask = State.get(VPMask);
else
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
Value *RunTimeVF = Builder.CreateZExtOrTrunc(State.get(getVF(), VPLane(0)),
Builder.getInt32Ty());

auto *PtrTy = Addr->getType();
auto *StrideTy = Stride->getType();
CallInst *NewLI = Builder.CreateIntrinsic(
Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
{Addr, Stride, Mask, RunTimeVF}, nullptr, "wide.strided.load");
NewLI->addParamAttr(
0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
applyMetadata(*NewLI);
State.set(this, NewLI);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenStridedLoadRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN ";
printAsOperand(O, SlotTracker);
O << " = load ";
getAddr()->printAsOperand(O, SlotTracker);
O << ", stride = ";
getStride()->printAsOperand(O, SlotTracker);
O << ", runtimeVF = ";
getVF()->printAsOperand(O, SlotTracker);
}
#endif

void VPWidenStoreRecipe::execute(VPTransformState &State) {
VPValue *StoredVPValue = getStoredValue();
bool CreateScatter = !isConsecutive();
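On the EVL question raised in the review thread: execute() above already passes the runtime-VF operand as the explicit-vector-length argument of vp.strided.load, so a later EVL tail-folding transform could plausibly retarget the recipe by swapping that single operand, much as is done for VPVectorEndPointerRecipe. A hedged sketch of that follow-up (not part of this patch):

static void setEVLForStridedLoad(VPWidenStridedLoadRecipe &StridedLoad,
                                 VPValue &EVL) {
  // Operand 2 is the runtime-VF operand (see getVF()); replacing it with the
  // EVL value makes execute() emit the intrinsic with EVL as its length.
  StridedLoad.setOperand(2, &EVL);
}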