From 99fe837cb07103f82b514dedb97508fbc7e9b4e7 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Wed, 19 Feb 2025 01:37:53 -0800 Subject: [PATCH 1/7] Init: New Recipe VPWidenStridedLoadRecipe --- .../Transforms/Vectorize/LoopVectorize.cpp | 112 +++++++- llvm/lib/Transforms/Vectorize/VPlan.h | 67 ++++- .../Transforms/Vectorize/VPlanAnalysis.cpp | 6 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 64 ++++- .../Transforms/Vectorize/VPlanTransforms.cpp | 9 +- llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 + .../Transforms/Vectorize/VPlanVerifier.cpp | 3 +- .../RISCV/riscv-vector-reverse-output.ll | 264 ++++++++---------- .../RISCV/riscv-vector-reverse.ll | 92 +++--- ...-force-tail-with-evl-reverse-load-store.ll | 91 +++--- 10 files changed, 453 insertions(+), 256 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b2d7c44761f6d..a12de7aed46ac 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1095,6 +1095,7 @@ class LoopVectorizationCostModel { CM_Widen_Reverse, // For consecutive accesses with stride -1. CM_Interleave, CM_GatherScatter, + CM_Strided, CM_Scalarize, CM_VectorCall, CM_IntrinsicCall @@ -1328,6 +1329,20 @@ class LoopVectorizationCostModel { return InterleaveInfo.getInterleaveGroup(Instr); } + /// Returns true if \p I is a memory instruction with strided memory access + /// that can be vectorized. + bool stridedAccessCanBeWidened(Instruction *I, ElementCount VF) const; + + /// Get the stride of the strided memory access instruction \p Instr. Return 0 + /// if the instruction \p Instr is not considered for vectorization as a + /// strided memory access. + int64_t getStride(Instruction *Instr) const { + auto It = StrideInfo.find(Instr); + if (It != StrideInfo.end()) + return It->second; + return 0; + } + /// Returns true if we're required to use a scalar epilogue for at least /// the final iteration of the original loop. bool requiresScalarEpilogue(bool IsVectorizing) const { @@ -1582,6 +1597,10 @@ class LoopVectorizationCostModel { /// element) InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); + /// The cost computation for strided load/store instruction. + InstructionCost getStridedLoadStoreCost(Instruction *I, + ElementCount VF) const; + /// Estimate the overhead of scalarizing an instruction. This is a /// convenience wrapper for the type-based getScalarizationOverhead API. InstructionCost getScalarizationOverhead(Instruction *I, @@ -1721,6 +1740,9 @@ class LoopVectorizationCostModel { Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } + /// The mapping of memory access instructions to their stride values. + DenseMap StrideInfo; + public: /// The loop that we evaluate. Loop *TheLoop; @@ -3278,6 +3300,31 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( return true; } +bool LoopVectorizationCostModel::stridedAccessCanBeWidened( + Instruction *I, ElementCount VF) const { + // Get and ensure we have a valid memory instruction. + assert((isa(I)) && "Invalid memory instruction"); + + // Only support strided access for vector VF. + if (!VF.isVector()) + return false; + + // FIXME: Remove this check for StoreInst after strided store is supported. + if (isa(I)) + return false; + + [[maybe_unused]] auto *Ptr = getLoadStorePointerOperand(I); + auto *ScalarTy = getLoadStoreType(I); + // TODO: Support non-unit-reverse strided accesses. 
Add stride analysis here + // to ensure that the accessed addresses are evenly spaced apart by a fixed + // stride. + assert(Legal->isConsecutivePtr(ScalarTy, Ptr) == -1 && + "Only supports strided accesses with a stride of -1"); + + const Align Alignment = getLoadStoreAlignment(I); + return TTI.isLegalStridedLoadStore(toVectorTy(ScalarTy, VF), Alignment); +} + void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // We should not collect Uniforms more than once per VF. Right now, // this function is called from collectUniformsAndScalars(), which @@ -3368,9 +3415,9 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { if (IsUniformMemOpUse(I)) return true; - return (WideningDecision == CM_Widen || - WideningDecision == CM_Widen_Reverse || - WideningDecision == CM_Interleave); + return ( + WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse || + WideningDecision == CM_Strided || WideningDecision == CM_Interleave); }; // Returns true if Ptr is the pointer operand of a memory access instruction @@ -4187,7 +4234,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( [](const auto *R) { return Instruction::Select; }) .Case( [](const auto *R) { return Instruction::Store; }) - .Case( + .Case( [](const auto *R) { return Instruction::Load; }) .Case( [](const auto *R) { return Instruction::Call; }) @@ -4286,6 +4333,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, case VPDef::VPWidenPointerInductionSC: case VPDef::VPReductionPHISC: case VPDef::VPInterleaveSC: + case VPDef::VPWidenStridedLoadSC: case VPDef::VPWidenLoadEVLSC: case VPDef::VPWidenLoadSC: case VPDef::VPWidenStoreEVLSC: @@ -5680,6 +5728,19 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, return Cost; } +InstructionCost +LoopVectorizationCostModel::getStridedLoadStoreCost(Instruction *I, + ElementCount VF) const { + Type *ValTy = getLoadStoreType(I); + auto *VectorTy = cast(toVectorTy(ValTy, VF)); + const Align Alignment = getLoadStoreAlignment(I); + const Value *Ptr = getLoadStorePointerOperand(I); + + return TTI.getStridedMemoryOpCost(I->getOpcode(), VectorTy, Ptr, + Legal->isMaskRequired(I), Alignment, + CostKind, I); +} + std::optional LoopVectorizationCostModel::getReductionPatternCost(Instruction *I, ElementCount VF, @@ -5999,6 +6060,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { "Expected consecutive stride."); InstWidening Decision = ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; + // Consider using strided load/store for consecutive reverse accesses to + // achieve more efficient memory operations. + if (ConsecutiveStride == -1 && stridedAccessCanBeWidened(&I, VF)) { + const InstructionCost StridedLoadStoreCost = + getStridedLoadStoreCost(&I, VF); + if (StridedLoadStoreCost < Cost) { + Decision = CM_Strided; + Cost = StridedLoadStoreCost; + StrideInfo[&I] = ConsecutiveStride; + } + } setWideningDecision(&I, VF, Decision, Cost); continue; } @@ -6650,6 +6722,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, return TTI::CastContextHint::Normal; switch (getWideningDecision(I, VF)) { + // TODO: New CastContextHint for strided accesses. + case LoopVectorizationCostModel::CM_Strided: case LoopVectorizationCostModel::CM_GatherScatter: return TTI::CastContextHint::GatherScatter; case LoopVectorizationCostModel::CM_Interleave: @@ -8233,16 +8307,27 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, // reverse consecutive. 
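
Aside (not part of the patch): the recipe introduced above ultimately lowers a reverse unit-stride load to llvm.experimental.vp.strided.load with a negative byte stride, instead of a contiguous load followed by a vector.reverse. The following is a minimal, self-contained C++ sketch of that intrinsic's per-lane semantics, under the assumption of an i32 element type; stridedLoadI32 and its parameter names are illustrative only and do not exist in LLVM.

#include <cstdint>
#include <cstring>
#include <vector>

// Scalar model of a masked, EVL-limited strided load: lane I reads the
// element at byte offset I * StrideBytes from Base (StrideBytes may be
// negative). Masked-off lanes are modeled here as 0.
static std::vector<int32_t> stridedLoadI32(const char *Base,
                                           int64_t StrideBytes,
                                           const std::vector<bool> &Mask,
                                           unsigned EVL) {
  std::vector<int32_t> Result(EVL, 0);
  for (unsigned I = 0; I < EVL; ++I) {
    if (!Mask[I])
      continue;
    int32_t V;
    std::memcpy(&V, Base + static_cast<int64_t>(I) * StrideBytes, sizeof(V));
    Result[I] = V;
  }
  return Result;
}

For the reverse i32 loops in the RISC-V tests below, Base is the address of the highest element of the current chunk and StrideBytes is -4, which matches how the recipe builder multiplies the stride of -1 elements by DL.getTypeAllocSize to form the i64 -4 stride operand seen in the CHECK lines.
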
LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, Range.Start); + + auto SameWiden = [&](ElementCount VF) -> bool { + return Decision == CM.getWideningDecision(I, VF); + }; + bool ContainsWidenVF = + LoopVectorizationPlanner::getDecisionAndClampRange(SameWiden, Range); + assert(ContainsWidenVF && + "At least widen the memory accesses by the Start VF."); + bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; bool Consecutive = Reverse || Decision == LoopVectorizationCostModel::CM_Widen; + bool Strided = Decision == LoopVectorizationCostModel::CM_Strided; VPValue *Ptr = isa(I) ? Operands[0] : Operands[1]; - if (Consecutive) { + if (Consecutive || Strided) { auto *GEP = dyn_cast( Ptr->getUnderlyingValue()->stripPointerCasts()); VPSingleDefRecipe *VectorPtr; if (Reverse) { + assert(!Strided && "Reverse and Strided are mutually exclusive."); // When folding the tail, we may compute an address that we don't in the // original scalar loop and it may not be inbounds. Drop Inbounds in that // case. @@ -8253,7 +8338,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, VectorPtr = new VPVectorEndPointerRecipe( Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc()); } else { - VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), + VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), Strided, GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), I->getDebugLoc()); @@ -8261,9 +8346,22 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, Builder.insert(VectorPtr); Ptr = VectorPtr; } - if (LoadInst *Load = dyn_cast(I)) + if (LoadInst *Load = dyn_cast(I)) { + if (Strided) { + const DataLayout &DL = Load->getDataLayout(); + auto *StrideTy = DL.getIndexType(Load->getPointerOperand()->getType()); + int64_t Stride = CM.getStride(Load); + assert(Stride == -1 && + "Only stride memory access with a stride of -1 is supported."); + VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get( + StrideTy, Stride * DL.getTypeAllocSize(getLoadStoreType(Load)))); + return new VPWidenStridedLoadRecipe(*Load, Ptr, StrideVPV, &Plan.getVF(), + Mask, VPIRMetadata(*Load, LVer), + I->getDebugLoc()); + } return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, VPIRMetadata(*Load, LVer), I->getDebugLoc()); + } StoreInst *Store = cast(I); return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index e634de1e17c69..a579191e5c4a6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -544,6 +544,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPBranchOnMaskSC: case VPRecipeBase::VPInterleaveSC: case VPRecipeBase::VPIRInstructionSC: + case VPRecipeBase::VPWidenStridedLoadSC: case VPRecipeBase::VPWidenLoadEVLSC: case VPRecipeBase::VPWidenLoadSC: case VPRecipeBase::VPWidenStoreEVLSC: @@ -1756,16 +1757,21 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, }; /// A recipe to compute the pointers for widened memory accesses of IndexTy. +/// Supports both consecutive and reverse consecutive accesses. +/// TODO: Support non-unit strided accesses . class VPVectorPointerRecipe : public VPRecipeWithIRFlags, public VPUnrollPartAccessor<1> { Type *IndexedTy; + /// Indicate whether to compute the pointer for strided memory accesses. 
+ bool Strided; + public: - VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, GEPNoWrapFlags GEPFlags, - DebugLoc DL) + VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool Strided, + GEPNoWrapFlags GEPFlags, DebugLoc DL) : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef(Ptr), GEPFlags, DL), - IndexedTy(IndexedTy) {} + IndexedTy(IndexedTy), Strided(Strided) {} VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC) @@ -1786,7 +1792,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags, } VPVectorPointerRecipe *clone() override { - return new VPVectorPointerRecipe(getOperand(0), IndexedTy, + return new VPVectorPointerRecipe(getOperand(0), IndexedTy, Strided, getGEPNoWrapFlags(), getDebugLoc()); } @@ -3003,7 +3009,8 @@ class VPWidenMemoryRecipe : public VPRecipeBase, public VPIRMetadata { return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC || R->getVPDefID() == VPRecipeBase::VPWidenStoreSC || R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC || - R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC; + R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC || + R->getVPDefID() == VPRecipeBase::VPWidenStridedLoadSC; } static inline bool classof(const VPUser *U) { @@ -3122,6 +3129,56 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { } }; +/// A recipe for strided load operations, using the base address, stride, and an +/// optional mask. This recipe will generate an vp.strided.load intrinsic call +/// to represent memory accesses with a fixed stride. +struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe, + public VPValue { + VPWidenStridedLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Stride, + VPValue *VF, VPValue *Mask, + const VPIRMetadata &Metadata, DebugLoc DL) + : VPWidenMemoryRecipe( + VPDef::VPWidenStridedLoadSC, Load, {Addr, Stride, VF}, + /*Consecutive=*/false, /*Reverse=*/false, Metadata, DL), + VPValue(this, &Load) { + setMask(Mask); + } + + VPWidenStridedLoadRecipe *clone() override { + return new VPWidenStridedLoadRecipe(cast(Ingredient), getAddr(), + getStride(), getVF(), getMask(), *this, + getDebugLoc()); + } + + VP_CLASSOF_IMPL(VPDef::VPWidenStridedLoadSC); + + /// Return the stride operand. + VPValue *getStride() const { return getOperand(1); } + + /// Return the VF operand. + VPValue *getVF() const { return getOperand(2); } + + /// Generate a strided load. + void execute(VPTransformState &State) override; + + /// Return the cost of this VPWidenStridedLoadRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// Returns true if the recipe only uses the first lane of operand \p Op. + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return Op == getAddr() || Op == getStride() || Op == getVF(); + } +}; + /// A recipe for widening store operations, using the stored value, the address /// to store to and an optional mask. 
struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe { diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index ac0f30cb4693c..d3e7455c4f79e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -180,8 +180,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) { } Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) { - assert((isa(R)) && - "Store recipes should not define any values"); + assert( + (isa( + R)) && + "Store recipes should not define any values"); return cast(&R->getIngredient())->getType(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 14ed40f16683a..724d1d2f68aac 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -80,6 +80,7 @@ bool VPRecipeBase::mayWriteToMemory() const { case VPWidenCastSC: case VPWidenGEPSC: case VPWidenIntOrFpInductionSC: + case VPWidenStridedLoadSC: case VPWidenLoadEVLSC: case VPWidenLoadSC: case VPWidenPHISC: @@ -101,6 +102,7 @@ bool VPRecipeBase::mayReadFromMemory() const { switch (getVPDefID()) { case VPInstructionSC: return cast(this)->opcodeMayReadOrWriteFromMemory(); + case VPWidenStridedLoadSC: case VPWidenLoadEVLSC: case VPWidenLoadSC: return true; @@ -182,6 +184,7 @@ bool VPRecipeBase::mayHaveSideEffects() const { } case VPInterleaveSC: return mayWriteToMemory(); + case VPWidenStridedLoadSC: case VPWidenLoadEVLSC: case VPWidenLoadSC: case VPWidenStoreEVLSC: @@ -2341,8 +2344,13 @@ void VPVectorPointerRecipe::execute(VPTransformState &State) { Value *Ptr = State.get(getOperand(0), VPLane(0)); Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart); + // TODO: Support non-unit-reverse strided accesses. + Value *Index = + Strided + ? 
Builder.CreateMul(Increment, ConstantInt::getSigned(IndexTy, -1)) + : Increment; Value *ResultPtr = - Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags()); + Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags()); State.set(this, ResultPtr, /*IsScalar*/ true); } @@ -3078,6 +3086,60 @@ void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +void VPWidenStridedLoadRecipe::execute(VPTransformState &State) { + Type *ScalarDataTy = getLoadStoreType(&Ingredient); + auto *DataTy = VectorType::get(ScalarDataTy, State.VF); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + + auto &Builder = State.Builder; + State.setDebugLocFrom(getDebugLoc()); + Value *Addr = State.get(getAddr(), /*IsScalar*/ true); + Value *Stride = State.get(getStride(), /*IsScalar*/ true); + Value *Mask = nullptr; + if (VPValue *VPMask = getMask()) + Mask = State.get(VPMask); + else + Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); + Value *RunTimeVF = Builder.CreateZExtOrTrunc(State.get(getVF(), VPLane(0)), + Builder.getInt32Ty()); + + auto *PtrTy = Addr->getType(); + auto *StrideTy = Stride->getType(); + CallInst *NewLI = Builder.CreateIntrinsic( + Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy}, + {Addr, Stride, Mask, RunTimeVF}, nullptr, "wide.strided.load"); + NewLI->addParamAttr( + 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); + applyMetadata(*NewLI); + State.set(this, NewLI); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPWidenStridedLoadRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN "; + printAsOperand(O, SlotTracker); + O << " = load "; + getAddr()->printAsOperand(O, SlotTracker); + O << ", stride = "; + getStride()->printAsOperand(O, SlotTracker); + O << ", runtimeVF = "; + getVF()->printAsOperand(O, SlotTracker); +} +#endif + +InstructionCost +VPWidenStridedLoadRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + const Value *Ptr = getLoadStorePointerOperand(&Ingredient); + + return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, Ptr, + IsMasked, Alignment, Ctx.CostKind, + &Ingredient); +} + void VPWidenStoreRecipe::execute(VPTransformState &State) { VPValue *StoredVPValue = getStoredValue(); bool CreateScatter = !isConsecutive(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index bc007589e1539..7958f0e902e1a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2117,6 +2117,12 @@ static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask, VPValue *NewMask = GetNewMask(L->getMask()); return new VPWidenLoadEVLRecipe(*L, EVL, NewMask); }) + .Case([&](VPWidenStridedLoadRecipe *L) { + VPValue *NewMask = GetNewMask(L->getMask()); + return new VPWidenStridedLoadRecipe( + *cast(&L->getIngredient()), L->getAddr(), L->getStride(), + &EVL, NewMask, *L, L->getDebugLoc()); + }) .Case([&](VPWidenStoreRecipe *S) { VPValue *NewMask = GetNewMask(S->getMask()); return new VPWidenStoreEVLRecipe(*S, EVL, NewMask); @@ -2215,7 +2221,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { NumDefVal <= 1 && "Only supports recipes with a single definition or without users."); EVLRecipe->insertBefore(CurRecipe); - if (isa(EVLRecipe)) { + if 
(isa(EVLRecipe)) { VPValue *CurVPV = CurRecipe->getVPSingleValue(); CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue()); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 64065edd315f9..b78a406face90 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -344,6 +344,7 @@ class VPDef { VPPartialReductionSC, VPReplicateSC, VPScalarIVStepsSC, + VPWidenStridedLoadSC, VPVectorPointerSC, VPVectorEndPointerSC, VPWidenCallSC, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 68b35d42e8674..e6294bdf7eb21 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -158,7 +158,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { .Case([&](const VPWidenIntrinsicRecipe *S) { return VerifyEVLUse(*S, S->getNumOperands() - 1); }) - .Case( + .Case( [&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); }) .Case( [&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); }) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll index a28673cf8e552..94ccbd91bdab1 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll @@ -39,23 +39,20 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] ; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 ; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]] -; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP5]] -; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP10]] -; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP11]] -; RV64-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP13]], align 4 -; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) -; RV64-NEXT: [[TMP14:%.*]] = add [[REVERSE]], splat (i32 1) -; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] -; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP17:%.*]] = sub i64 1, [[TMP5]] -; RV64-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP16]] -; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP17]] -; RV64-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP14]]) -; RV64-NEXT: store [[REVERSE1]], ptr [[TMP19]], align 4 +; RV64-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0 +; RV64-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 -4, splat (i1 true), i32 [[TMP11]]) +; RV64-NEXT: [[TMP12:%.*]] = add [[WIDE_STRIDED_LOAD]], splat (i32 1) +; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] +; RV64-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP5]] +; RV64-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP5]] +; RV64-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP14]] +; RV64-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP15]] +; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP12]]) +; 
RV64-NEXT: store [[REVERSE]], ptr [[TMP17]], align 4 ; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; RV64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; RV64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; RV64: [[MIDDLE_BLOCK]]: ; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] ; RV64-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -66,8 +63,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV64-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; RV64-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 ; RV64-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]] -; RV64-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 -; RV64-NEXT: [[ADD:%.*]] = add i32 [[TMP21]], 1 +; RV64-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 +; RV64-NEXT: [[ADD:%.*]] = add i32 [[TMP19]], 1 ; RV64-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]] ; RV64-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4 ; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 @@ -96,25 +93,21 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] ; RV32-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 ; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]] -; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]] -; RV32-NEXT: [[TMP12:%.*]] = sub i32 1, [[TMP10]] -; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 [[TMP11]] -; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 [[TMP12]] -; RV32-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 -; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) -; RV32-NEXT: [[TMP15:%.*]] = add [[REVERSE]], splat (i32 1) -; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] -; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]] -; RV32-NEXT: [[TMP19:%.*]] = sub i32 1, [[TMP17]] -; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[TMP18]] -; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 [[TMP19]] -; RV32-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP15]]) -; RV32-NEXT: store [[REVERSE1]], ptr [[TMP21]], align 4 +; RV32-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0 +; RV32-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP5]] to i32 +; RV32-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP10]], i32 -4, splat (i1 true), i32 [[TMP11]]) +; RV32-NEXT: [[TMP12:%.*]] = add [[WIDE_STRIDED_LOAD]], splat (i32 1) +; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] +; RV32-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP5]] to i32 +; RV32-NEXT: [[TMP15:%.*]] = mul i32 0, [[TMP14]] +; RV32-NEXT: [[TMP16:%.*]] = sub i32 1, [[TMP14]] +; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 [[TMP15]] +; RV32-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 [[TMP16]] 
+; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP12]]) +; RV32-NEXT: store [[REVERSE]], ptr [[TMP18]], align 4 ; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; RV32-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV32-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; RV32-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV32-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; RV32: [[MIDDLE_BLOCK]]: ; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] ; RV32-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -125,8 +118,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; RV32-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 ; RV32-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]] -; RV32-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 -; RV32-NEXT: [[ADD:%.*]] = add i32 [[TMP23]], 1 +; RV32-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 +; RV32-NEXT: [[ADD:%.*]] = add i32 [[TMP20]], 1 ; RV32-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]] ; RV32-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4 ; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 @@ -154,38 +147,35 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV64-UF2: [[VECTOR_BODY]]: ; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 -; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP9]] -; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP12:%.*]] = sub i64 1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP11]] -; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP12]] -; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP15]] -; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]] -; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 -; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) -; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP18]], align 4 -; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD1]]) -; RV64-UF2-NEXT: [[TMP19:%.*]] = add [[REVERSE]], splat (i32 1) -; RV64-UF2-NEXT: [[TMP20:%.*]] = add [[REVERSE2]], splat (i32 1) -; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]] -; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP22]] -; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]] -; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP26]] -; RV64-UF2-NEXT: [[TMP29:%.*]] = 
getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]] -; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP19]]) -; RV64-UF2-NEXT: store [[REVERSE3]], ptr [[TMP25]], align 4 -; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP20]]) -; RV64-UF2-NEXT: store [[REVERSE4]], ptr [[TMP29]], align 4 +; RV64-UF2-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 +; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]] +; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0 +; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4 +; RV64-UF2-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], -1 +; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP13]] +; RV64-UF2-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 -4, splat (i1 true), i32 [[TMP15]]) +; RV64-UF2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD1:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP14]], i64 -4, splat (i1 true), i32 [[TMP16]]) +; RV64-UF2-NEXT: [[TMP17:%.*]] = add [[WIDE_STRIDED_LOAD]], splat (i32 1) +; RV64-UF2-NEXT: [[TMP18:%.*]] = add [[WIDE_STRIDED_LOAD1]], splat (i32 1) +; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] +; RV64-UF2-NEXT: [[TMP20:%.*]] = mul i64 0, [[TMP5]] +; RV64-UF2-NEXT: [[TMP21:%.*]] = sub i64 1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP20]] +; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i64 [[TMP21]] +; RV64-UF2-NEXT: [[TMP24:%.*]] = mul i64 -1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP25:%.*]] = sub i64 1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP24]] +; RV64-UF2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i64 [[TMP25]] +; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP17]]) +; RV64-UF2-NEXT: store [[REVERSE]], ptr [[TMP23]], align 4 +; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP18]]) +; RV64-UF2-NEXT: store [[REVERSE2]], ptr [[TMP27]], align 4 ; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-UF2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; RV64-UF2-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-UF2-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; RV64-UF2: [[MIDDLE_BLOCK]]: ; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] ; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -196,8 +186,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV64-UF2-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; RV64-UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 ; RV64-UF2-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]] -; RV64-UF2-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 -; RV64-UF2-NEXT: [[ADD:%.*]] = add i32 [[TMP31]], 1 +; RV64-UF2-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 +; 
RV64-UF2-NEXT: [[ADD:%.*]] = add i32 [[TMP29]], 1 ; RV64-UF2-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]] ; RV64-UF2-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4 ; RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 @@ -245,23 +235,20 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] ; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 ; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]] -; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP5]] -; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP10]] -; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP11]] -; RV64-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP13]], align 4 -; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) -; RV64-NEXT: [[TMP14:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) -; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] -; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP17:%.*]] = sub i64 1, [[TMP5]] -; RV64-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[TMP16]] -; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 [[TMP17]] -; RV64-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP14]]) -; RV64-NEXT: store [[REVERSE1]], ptr [[TMP19]], align 4 +; RV64-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0 +; RV64-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP10]], i64 -4, splat (i1 true), i32 [[TMP11]]) +; RV64-NEXT: [[TMP12:%.*]] = fadd [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00) +; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] +; RV64-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP5]] +; RV64-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP5]] +; RV64-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[TMP14]] +; RV64-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP15]] +; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP12]]) +; RV64-NEXT: store [[REVERSE]], ptr [[TMP17]], align 4 ; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; RV64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; RV64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; RV64: [[MIDDLE_BLOCK]]: ; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] ; RV64-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -272,8 +259,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV64-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; RV64-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 ; RV64-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]] -; RV64-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 -; RV64-NEXT: [[FADD:%.*]] = fadd float [[TMP21]], 1.000000e+00 +; RV64-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 +; 
RV64-NEXT: [[FADD:%.*]] = fadd float [[TMP19]], 1.000000e+00 ; RV64-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]] ; RV64-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4 ; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 @@ -302,25 +289,21 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] ; RV32-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 ; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]] -; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]] -; RV32-NEXT: [[TMP12:%.*]] = sub i32 1, [[TMP10]] -; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 [[TMP11]] -; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP12]] -; RV32-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 -; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) -; RV32-NEXT: [[TMP15:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) -; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] -; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]] -; RV32-NEXT: [[TMP19:%.*]] = sub i32 1, [[TMP17]] -; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP18]] -; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP19]] -; RV32-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP15]]) -; RV32-NEXT: store [[REVERSE1]], ptr [[TMP21]], align 4 +; RV32-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0 +; RV32-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP5]] to i32 +; RV32-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4f32.p0.i32(ptr align 4 [[TMP10]], i32 -4, splat (i1 true), i32 [[TMP11]]) +; RV32-NEXT: [[TMP12:%.*]] = fadd [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00) +; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] +; RV32-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP5]] to i32 +; RV32-NEXT: [[TMP15:%.*]] = mul i32 0, [[TMP14]] +; RV32-NEXT: [[TMP16:%.*]] = sub i32 1, [[TMP14]] +; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP15]] +; RV32-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP16]] +; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP12]]) +; RV32-NEXT: store [[REVERSE]], ptr [[TMP18]], align 4 ; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; RV32-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV32-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; RV32-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV32-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; RV32: [[MIDDLE_BLOCK]]: ; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] ; RV32-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -331,8 +314,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; RV32-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 ; RV32-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]] -; RV32-NEXT: [[TMP23:%.*]] = 
load float, ptr [[ARRAYIDX_B]], align 4 -; RV32-NEXT: [[FADD:%.*]] = fadd float [[TMP23]], 1.000000e+00 +; RV32-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 +; RV32-NEXT: [[FADD:%.*]] = fadd float [[TMP20]], 1.000000e+00 ; RV32-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]] ; RV32-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4 ; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 @@ -360,38 +343,35 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV64-UF2: [[VECTOR_BODY]]: ; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 -; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] -; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP12:%.*]] = sub i64 1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP11]] -; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[TMP12]] -; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP15]] -; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP16]] -; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 -; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) -; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP18]], align 4 -; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD1]]) -; RV64-UF2-NEXT: [[TMP19:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) -; RV64-UF2-NEXT: [[TMP20:%.*]] = fadd [[REVERSE2]], splat (float 1.000000e+00) -; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] -; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP22]] -; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP23]] -; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP26]] -; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]] -; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP19]]) -; RV64-UF2-NEXT: store [[REVERSE3]], ptr [[TMP25]], align 4 -; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP20]]) -; RV64-UF2-NEXT: store [[REVERSE4]], ptr [[TMP29]], align 4 +; RV64-UF2-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 +; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]] +; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0 +; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4 +; RV64-UF2-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], -1 +; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP13]] +; RV64-UF2-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call 
@llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP10]], i64 -4, splat (i1 true), i32 [[TMP15]]) +; RV64-UF2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD1:%.*]] = call @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP14]], i64 -4, splat (i1 true), i32 [[TMP16]]) +; RV64-UF2-NEXT: [[TMP17:%.*]] = fadd [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00) +; RV64-UF2-NEXT: [[TMP18:%.*]] = fadd [[WIDE_STRIDED_LOAD1]], splat (float 1.000000e+00) +; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] +; RV64-UF2-NEXT: [[TMP20:%.*]] = mul i64 0, [[TMP5]] +; RV64-UF2-NEXT: [[TMP21:%.*]] = sub i64 1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[TMP20]] +; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP21]] +; RV64-UF2-NEXT: [[TMP24:%.*]] = mul i64 -1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP25:%.*]] = sub i64 1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[TMP24]] +; RV64-UF2-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP25]] +; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP17]]) +; RV64-UF2-NEXT: store [[REVERSE]], ptr [[TMP23]], align 4 +; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP18]]) +; RV64-UF2-NEXT: store [[REVERSE2]], ptr [[TMP27]], align 4 ; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-UF2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; RV64-UF2-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-UF2-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; RV64-UF2: [[MIDDLE_BLOCK]]: ; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] ; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -402,8 +382,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV64-UF2-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; RV64-UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 ; RV64-UF2-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]] -; RV64-UF2-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 -; RV64-UF2-NEXT: [[FADD:%.*]] = fadd float [[TMP31]], 1.000000e+00 +; RV64-UF2-NEXT: [[TMP29:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 +; RV64-UF2-NEXT: [[FADD:%.*]] = fadd float [[TMP29]], 1.000000e+00 ; RV64-UF2-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]] ; RV64-UF2-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4 ; RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 9e77a0ca8bcc9..b1451ba8c3bdc 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -38,7 +38,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 
For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4 @@ -74,15 +74,15 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, vp<[[VF]]> -; CHECK-NEXT: WIDEN ir<%1> = load vp<[[VEC_PTR]]> -; CHECK-NEXT: WIDEN ir<%add9> = add ir<%1>, ir<1> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, vp<[[VF]]> -; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add9> +; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> +; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> +; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1> +; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, vp<[[VF]]> +; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors @@ -113,7 +113,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 
%idxprom ; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4 @@ -144,7 +144,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class ; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Loop cost is 32 +; CHECK-NEXT: LV: Loop cost is 27 ; CHECK-NEXT: LV: IC is 1 ; CHECK-NEXT: LV: VF is vscale x 4 ; CHECK-NEXT: LV: Not Interleaving. @@ -154,7 +154,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK: Executing best plan with VF=vscale x 4, UF=1 ; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' { ; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF -; CHECK-NEXT: Live-in ir<[[VFxUF:%.+]]>.1 = VF * UF +; CHECK-NEXT: Live-in ir<[[VF]]>.1 = VF * UF ; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: ir<%0> = original trip-count ; CHECK-EMPTY: @@ -199,16 +199,16 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ] ; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[DEV_IV]]>, ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, ir<[[VF]]> -; CHECK-NEXT: WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]> -; CHECK-NEXT: WIDEN ir<%add9> = add ir<[[L]]>, ir<1> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<[[VF]]> -; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add9> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1 +; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> +; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> +; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1> +; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]> +; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VF]]>.1 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } @@ -289,7 +289,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load 
float, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 @@ -325,15 +325,15 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, vp<[[VF]]> -; CHECK-NEXT: WIDEN ir<%1> = load vp<[[VEC_PTR]]> -; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%1>, ir<1.000000e+00> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, vp<[[VF]]> -; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv1> +; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> +; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> +; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00> +; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, vp<[[VF]]> +; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors @@ -364,7 +364,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 @@ -395,7 +395,7 @@ define void @vector_reverse_f32(ptr nocapture noundef 
writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class ; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Loop cost is 34 +; CHECK-NEXT: LV: Loop cost is 29 ; CHECK-NEXT: LV: IC is 1 ; CHECK-NEXT: LV: VF is vscale x 4 ; CHECK-NEXT: LV: Not Interleaving. @@ -405,7 +405,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK: Executing best plan with VF=vscale x 4, UF=1 ; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' { ; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF -; CHECK-NEXT: Live-in ir<[[VFxUF:%.+]]>.1 = VF * UF +; CHECK-NEXT: Live-in ir<[[VF]]>.1 = VF * UF ; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: ir<%0> = original trip-count ; CHECK-EMPTY: @@ -450,16 +450,16 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ] ; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[DEV_IV]]>, ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, ir<[[VF]]> -; CHECK-NEXT: WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]> -; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<[[L]]>, ir<1.000000e+00> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<[[VF]]> -; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%conv1> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1 +; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> +; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> +; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00> +; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]> +; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VF]]>.1 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll index 7bdb67b8a0fba..251cf7c01c976 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll @@ -29,37 +29,32 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt ; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub 
i64 [[STARTVAL:%.*]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], -1 -; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP7]] -; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP5]] to i64 -; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP18]] -; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 1, [[TMP18]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]] -; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]] -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]] -; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64 -; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP19]] -; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP19]] -; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]] -; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]] -; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE3]], ptr align 4 [[TMP17]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], -1 +; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP6]] +; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP8]], i64 -4, splat (i1 true), i32 [[TMP5]]) +; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP6]] +; IF-EVL-NEXT: [[TMP10:%.*]] = zext i32 [[TMP5]] to i64 +; IF-EVL-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP10]] +; IF-EVL-NEXT: [[TMP12:%.*]] = sub i64 1, [[TMP10]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP9]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP12]] +; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[WIDE_STRIDED_LOAD]], splat (i1 true), i32 [[TMP5]]) +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE]], ptr align 4 [[TMP14]], splat (i1 true), i32 [[TMP5]]) +; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP5]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] -; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br label [[LOOPEND:%.*]] ; IF-EVL: scalar.ph: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[STARTVAL]], [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ 0, [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: 
[[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ADD]] = add i64 [[ADD_PHI]], -1 ; IF-EVL-NEXT: [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[ADD]] ; IF-EVL-NEXT: [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4 @@ -127,35 +122,29 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal ; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL:%.*]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[OFFSET_IDX3:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32 -; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], -1 -; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[OFFSET_IDX3]] -; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt [[VP_OP_LOAD]], splat (i32 100) -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP26:%.*]] = zext i32 [[TMP5]] to i64 -; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP26]] -; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP26]] -; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]] -; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]] -; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP14]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], [[VP_REVERSE_MASK]], i32 [[TMP5]]) -; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD4]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP5]] to i64 -; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP27]] -; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP27]] -; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]] -; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]] -; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP14]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE5]], ptr align 4 [[TMP25]], [[VP_REVERSE_MASK6]], i32 [[TMP5]]) -; IF-EVL-NEXT: [[TMP28:%.*]] = zext i32 [[TMP5]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[OFFSET_IDX1:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32 +; IF-EVL-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], -1 +; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[OFFSET_IDX1]] +; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP8]], splat (i1 true), i32 [[TMP5]]) +; IF-EVL-NEXT: [[TMP9:%.*]] = icmp slt [[VP_OP_LOAD]], splat 
(i32 100) +; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[PTR1:%.*]], i64 [[TMP6]] +; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0 +; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP11]], i64 -4, [[TMP9]], i32 [[TMP5]]) +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP6]] +; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP5]] to i64 +; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP13]] +; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP13]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP12]], i64 [[TMP14]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP15]] +; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[WIDE_STRIDED_LOAD]], splat (i1 true), i32 [[TMP5]]) +; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP9]], splat (i1 true), i32 [[TMP5]]) +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE]], ptr align 4 [[TMP17]], [[VP_REVERSE_MASK]], i32 [[TMP5]]) +; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP5]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] -; IF-EVL-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br label [[LOOPEND:%.*]] ; IF-EVL: scalar.ph: From 991267f18c73c653d1074dc17f17364dbcb4d106 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Wed, 14 May 2025 02:02:08 -0700 Subject: [PATCH 2/7] [WIP][VPlan Based] Try to remove CM_Strided from uniform analysis Also cherry-pick the branch Mel-Chen:legalizeAndOptimizeInductions. However, this still does not work as well as collectLoopUniforms when the use-chain is too complicated. 
:( --- .../Transforms/Vectorize/LoopVectorize.cpp | 6 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 22 ++-- .../RISCV/riscv-vector-reverse.ll | 110 ++++++++---------- 3 files changed, 62 insertions(+), 76 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index a12de7aed46ac..37a9050b1e259 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3415,9 +3415,9 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { if (IsUniformMemOpUse(I)) return true; - return ( - WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse || - WideningDecision == CM_Strided || WideningDecision == CM_Interleave); + return (WideningDecision == CM_Widen || + WideningDecision == CM_Widen_Reverse || + WideningDecision == CM_Interleave); }; // Returns true if Ptr is the pointer operand of a memory access instruction diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 7958f0e902e1a..1577d1413186f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -626,13 +626,14 @@ static SmallVector collectUsersRecursively(VPValue *V) { static void legalizeAndOptimizeInductions(VPlan &Plan) { using namespace llvm::VPlanPatternMatch; VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); - bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly(); - VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi()); - for (VPRecipeBase &Phi : HeaderVPBB->phis()) { - auto *PhiR = dyn_cast(&Phi); - if (!PhiR) - continue; + SmallVector InductionPhis; + for (VPRecipeBase &R : HeaderVPBB->phis()) + if (auto *IV = dyn_cast(&R)) + InductionPhis.push_back(IV); + bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly(); + VPBuilder Builder; + for (VPWidenInductionRecipe *PhiR : reverse(InductionPhis)) { // Try to narrow wide and replicating recipes to uniform recipes, based on // VPlan analysis. // TODO: Apply to all recipes in the future, to replace legacy uniformity @@ -642,7 +643,8 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { auto *Def = dyn_cast(U); auto *RepR = dyn_cast(U); // Skip recipes that shouldn't be narrowed. - if (!Def || !isa(Def) || + if (!Def || + !isa(Def) || Def->getNumUsers() == 0 || !Def->getUnderlyingValue() || (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))) continue; @@ -655,11 +657,13 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { Def->operands(), /*IsUniform*/ true); Clone->insertAfter(Def); Def->replaceAllUsesWith(Clone); + Def->eraseFromParent(); } + Builder.setInsertPoint(HeaderVPBB, HeaderVPBB->getFirstNonPhi()); // Replace wide pointer inductions which have only their scalars used by // PtrAdd(IndStart, ScalarIVSteps (0, Step)). - if (auto *PtrIV = dyn_cast(&Phi)) { + if (auto *PtrIV = dyn_cast(PhiR)) { if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF())) continue; @@ -680,7 +684,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { // Replace widened induction with scalar steps for users that only use // scalars. 
- auto *WideIV = cast(&Phi); + auto *WideIV = cast(PhiR); if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) { return U->usesScalars(WideIV); })) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index b1451ba8c3bdc..9da3abe53abe9 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -25,18 +25,13 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: The max safe fixed VF is: 67108864. ; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295. ; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1 @@ -47,9 +42,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 ; CHECK-NEXT: LV: Using user VF vscale x 4. 
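A note on the stride operands seen in these checks: the ir<-4> printed on the widened loads is a byte stride, i.e. the element stride of the reverse access (-1) scaled by the element's allocation size, which is what the recipe-building code computes via DataLayout::getTypeAllocSize. A minimal C++ sketch of that computation (the helper name is invented for illustration and is not part of the patch):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"

// Byte stride of a strided access: element stride times element alloc size.
// A reverse access over i32 or float uses an element stride of -1, so the
// byte stride becomes -4, matching the "stride = ir<-4>" operands above.
static int64_t getByteStride(const llvm::DataLayout &DL, llvm::Type *ElemTy,
                             int64_t ElemStride) {
  return ElemStride *
         static_cast<int64_t>(DL.getTypeAllocSize(ElemTy).getFixedValue());
}
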
; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1 @@ -72,10 +64,9 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION -; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> -; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1> -; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1> -; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> +; CHECK-NEXT: ir<[[WIDEN_IV:%.+]]> = WIDEN-INDUCTION ir<%n>, ir<-1>, vp<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1> +; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64 ; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> ; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> ; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]> @@ -110,8 +101,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: } ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1 @@ -124,27 +115,26 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV(REG): At #0 Interval # 0 ; CHECK-NEXT: LV(REG): At #1 Interval # 1 ; CHECK-NEXT: LV(REG): At #2 Interval # 2 -; CHECK-NEXT: LV(REG): At #3 Interval # 2 -; CHECK-NEXT: LV(REG): At #4 Interval # 2 -; CHECK-NEXT: LV(REG): At #5 Interval # 2 -; CHECK-NEXT: LV(REG): At #6 Interval # 3 -; CHECK-NEXT: LV(REG): At #7 Interval # 3 -; CHECK-NEXT: LV(REG): At #8 Interval # 3 -; CHECK-NEXT: LV(REG): At #9 Interval # 3 -; CHECK-NEXT: LV(REG): At #10 Interval # 3 +; CHECK-NEXT: LV(REG): At #3 Interval # 3 +; CHECK-NEXT: LV(REG): At #4 Interval # 3 +; CHECK-NEXT: LV(REG): At #5 Interval # 4 +; CHECK-NEXT: LV(REG): At #6 Interval # 4 +; CHECK-NEXT: LV(REG): At #7 Interval # 4 +; CHECK-NEXT: 
LV(REG): At #8 Interval # 4 +; CHECK-NEXT: LV(REG): At #9 Interval # 4 +; CHECK-NEXT: LV(REG): At #10 Interval # 4 ; CHECK-NEXT: LV(REG): At #11 Interval # 3 -; CHECK-NEXT: LV(REG): At #12 Interval # 2 -; CHECK-NEXT: LV(REG): At #13 Interval # 2 +; CHECK-NEXT: LV(REG): At #12 Interval # 3 ; CHECK-NEXT: LV(REG): VF = vscale x 4 ; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class ; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Loop cost is 27 +; CHECK-NEXT: LV: Loop cost is 31 ; CHECK-NEXT: LV: IC is 1 ; CHECK-NEXT: LV: VF is vscale x 4 ; CHECK-NEXT: LV: Not Interleaving. @@ -193,14 +183,15 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: IR %18 = mul i64 %17, 4 ; CHECK-NEXT: vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1> ; CHECK-NEXT: vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1> +; CHECK-NEXT: EMIT vp<[[STEP:%.+]]> = step-vector i32 ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ] -; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> -; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1> -; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> +; CHECK-NEXT: ir<[[WIDEN_IV:%.+]]> = WIDEN-INDUCTION ir<%n>, ir<-1>, ir<%18>, vp<[[STEP]]> +; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1> +; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64 ; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> ; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> ; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]> @@ -276,18 +267,13 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: The max safe fixed VF is: 67108864. ; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295. 
; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 @@ -298,9 +284,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 ; CHECK-NEXT: LV: Using user VF vscale x 4. 
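On RISC-V the strided recipe is expected to lower to llvm.experimental.vp.strided.load with a negative byte stride, as the IF-EVL output earlier in this series shows. The sketch below only illustrates how such a call can be built with IRBuilder; it is not the recipe's actual execute() implementation, the function name is made up, and alignment attributes are omitted for brevity:

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"

// Illustrative only: emit a masked, EVL-predicated strided load whose byte
// stride walks memory backwards (e.g. -4 for i32 elements).
llvm::Value *emitReverseStridedLoad(llvm::IRBuilderBase &Builder,
                                    llvm::VectorType *DataTy,
                                    llvm::Value *Addr, llvm::Value *Mask,
                                    llvm::Value *EVL, int64_t ByteStride) {
  llvm::Value *Stride = llvm::ConstantInt::get(
      llvm::Type::getInt64Ty(Builder.getContext()), ByteStride);
  return Builder.CreateIntrinsic(
      llvm::Intrinsic::experimental_vp_strided_load,
      {DataTy, Addr->getType(), Stride->getType()}, {Addr, Stride, Mask, EVL},
      /*FMFSource=*/nullptr, "wide.strided.load");
}
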
; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1 @@ -323,10 +306,9 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION -; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> -; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1> -; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1> -; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> +; CHECK-NEXT: ir<[[WIDEN_IV:%.+]]> = WIDEN-INDUCTION ir<%n>, ir<-1>, vp<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1> +; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64 ; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> ; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> ; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]> @@ -361,8 +343,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: } ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 @@ -375,27 +357,26 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV(REG): At #0 Interval # 0 ; CHECK-NEXT: LV(REG): At #1 Interval # 1 ; CHECK-NEXT: LV(REG): At #2 Interval # 2 -; CHECK-NEXT: LV(REG): At #3 Interval # 2 -; CHECK-NEXT: LV(REG): At #4 Interval # 2 -; CHECK-NEXT: LV(REG): At #5 Interval # 2 -; CHECK-NEXT: LV(REG): At #6 Interval # 3 -; CHECK-NEXT: LV(REG): At #7 Interval # 3 -; CHECK-NEXT: LV(REG): At #8 Interval # 3 -; CHECK-NEXT: LV(REG): At #9 Interval # 3 -; CHECK-NEXT: LV(REG): At #10 Interval # 3 +; CHECK-NEXT: LV(REG): At #3 Interval # 3 +; CHECK-NEXT: LV(REG): At #4 Interval # 3 +; CHECK-NEXT: LV(REG): At #5 Interval # 4 +; CHECK-NEXT: LV(REG): At #6 Interval # 4 +; CHECK-NEXT: LV(REG): At #7 Interval # 
4 +; CHECK-NEXT: LV(REG): At #8 Interval # 4 +; CHECK-NEXT: LV(REG): At #9 Interval # 4 +; CHECK-NEXT: LV(REG): At #10 Interval # 4 ; CHECK-NEXT: LV(REG): At #11 Interval # 3 -; CHECK-NEXT: LV(REG): At #12 Interval # 2 -; CHECK-NEXT: LV(REG): At #13 Interval # 2 +; CHECK-NEXT: LV(REG): At #12 Interval # 3 ; CHECK-NEXT: LV(REG): VF = vscale x 4 ; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class ; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Loop cost is 29 +; CHECK-NEXT: LV: Loop cost is 33 ; CHECK-NEXT: LV: IC is 1 ; CHECK-NEXT: LV: VF is vscale x 4 ; CHECK-NEXT: LV: Not Interleaving. @@ -444,14 +425,15 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: IR %18 = mul i64 %17, 4 ; CHECK-NEXT: vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1> ; CHECK-NEXT: vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1> +; CHECK-NEXT: EMIT vp<[[STEP:%.+]]> = step-vector i32 ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ] -; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> -; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1> -; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> +; CHECK-NEXT: ir<[[WIDEN_IV:%.+]]> = WIDEN-INDUCTION ir<%n>, ir<-1>, ir<%18>, vp<[[STEP]]> +; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1> +; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64 ; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> ; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> ; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]> From d967afe8edbe630c89d1351e2896d8c6f7588d2a Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Tue, 20 May 2025 00:42:15 -0700 Subject: [PATCH 3/7] [WIP][VPlan Based] Generate VPWidenStrideLoadRecipe in VPlanTransform Still rely on CM_Strided to known legal and cost. --- .../Transforms/Vectorize/LoopVectorize.cpp | 52 +++++-------------- .../Transforms/Vectorize/VPlanTransforms.cpp | 49 +++++++++++++++++ .../Transforms/Vectorize/VPlanTransforms.h | 4 ++ 3 files changed, 66 insertions(+), 39 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 37a9050b1e259..1d577d3ee0f6a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1333,14 +1333,9 @@ class LoopVectorizationCostModel { /// that can be vectorized. bool stridedAccessCanBeWidened(Instruction *I, ElementCount VF) const; - /// Get the stride of the strided memory access instruction \p Instr. Return 0 - /// if the instruction \p Instr is not considered for vectorization as a - /// strided memory access. 
- int64_t getStride(Instruction *Instr) const { - auto It = StrideInfo.find(Instr); - if (It != StrideInfo.end()) - return It->second; - return 0; + /// Get the stride information of the strided memory accesses. + SmallDenseMap getStrideInfo() const { + return StrideInfo; } /// Returns true if we're required to use a scalar epilogue for at least @@ -1741,7 +1736,7 @@ class LoopVectorizationCostModel { } /// The mapping of memory access instructions to their stride values. - DenseMap StrideInfo; + SmallDenseMap StrideInfo; public: /// The loop that we evaluate. @@ -8307,27 +8302,16 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, // reverse consecutive. LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, Range.Start); - - auto SameWiden = [&](ElementCount VF) -> bool { - return Decision == CM.getWideningDecision(I, VF); - }; - bool ContainsWidenVF = - LoopVectorizationPlanner::getDecisionAndClampRange(SameWiden, Range); - assert(ContainsWidenVF && - "At least widen the memory accesses by the Start VF."); - bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; bool Consecutive = Reverse || Decision == LoopVectorizationCostModel::CM_Widen; - bool Strided = Decision == LoopVectorizationCostModel::CM_Strided; VPValue *Ptr = isa(I) ? Operands[0] : Operands[1]; - if (Consecutive || Strided) { + if (Consecutive) { auto *GEP = dyn_cast( Ptr->getUnderlyingValue()->stripPointerCasts()); VPSingleDefRecipe *VectorPtr; if (Reverse) { - assert(!Strided && "Reverse and Strided are mutually exclusive."); // When folding the tail, we may compute an address that we don't in the // original scalar loop and it may not be inbounds. Drop Inbounds in that // case. @@ -8338,30 +8322,17 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, VectorPtr = new VPVectorEndPointerRecipe( Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc()); } else { - VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), Strided, - GEP ? GEP->getNoWrapFlags() - : GEPNoWrapFlags::none(), - I->getDebugLoc()); + VectorPtr = new VPVectorPointerRecipe( + Ptr, getLoadStoreType(I), /*Strided*/ false, + GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), + I->getDebugLoc()); } Builder.insert(VectorPtr); Ptr = VectorPtr; } - if (LoadInst *Load = dyn_cast(I)) { - if (Strided) { - const DataLayout &DL = Load->getDataLayout(); - auto *StrideTy = DL.getIndexType(Load->getPointerOperand()->getType()); - int64_t Stride = CM.getStride(Load); - assert(Stride == -1 && - "Only stride memory access with a stride of -1 is supported."); - VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get( - StrideTy, Stride * DL.getTypeAllocSize(getLoadStoreType(Load)))); - return new VPWidenStridedLoadRecipe(*Load, Ptr, StrideVPV, &Plan.getVF(), - Mask, VPIRMetadata(*Load, LVer), - I->getDebugLoc()); - } + if (LoadInst *Load = dyn_cast(I)) return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, VPIRMetadata(*Load, LVer), I->getDebugLoc()); - } StoreInst *Store = cast(I); return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive, @@ -9524,6 +9495,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, VPlanTransforms::runPass(VPlanTransforms::createInterleaveGroups, *Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed()); + // !!! 
NEED COMMENT + VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan, + CM.getStrideInfo()); // Replace VPValues for known constant strides guaranteed by predicate scalar // evolution. diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 1577d1413186f..c3bcec758ccce 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2517,6 +2517,55 @@ void VPlanTransforms::createInterleaveGroups( } } +void VPlanTransforms::convertToStridedAccesses( + VPlan &Plan, const SmallDenseMap &StrideInfo) { + // !!! FIXME: Should remove StrideInfo for next step. + if (Plan.hasScalarVFOnly() || StrideInfo.empty()) + return; + + // !!! FIXME: Should clamp VF for legal and cost in next step + SmallVector ToErase; + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( + vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + // !!! FIXME: Should use LoadR->isReverse() for next step + if (auto *LoadR = dyn_cast(&R); + LoadR && !LoadR->isConsecutive()) { + auto *LI = cast(&LoadR->getIngredient()); + auto It = StrideInfo.find(LI); + if (It == StrideInfo.end()) + continue; + int64_t Stride = It->second; + assert(Stride == -1 && + "Only stride memory access with a stride of -1 is supported."); + // !!! FIXME: Should get VPVectorEndPointerRecipe for reverse + VPValue *Ptr = LoadR->getAddr(); + auto *GEP = dyn_cast( + Ptr->getUnderlyingValue()->stripPointerCasts()); + auto *NewPtr = new VPVectorPointerRecipe( + Ptr, getLoadStoreType(LI), /*Stride*/ true, + GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), + LoadR->getDebugLoc()); + NewPtr->insertBefore(LoadR); + + const DataLayout &DL = LI->getDataLayout(); + auto *StrideTy = DL.getIndexType(LI->getPointerOperand()->getType()); + VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get( + StrideTy, Stride * DL.getTypeAllocSize(getLoadStoreType(LI)))); + auto *StridedLoad = new VPWidenStridedLoadRecipe( + *LI, NewPtr, StrideVPV, &Plan.getVF(), LoadR->getMask(), *LoadR, + LoadR->getDebugLoc()); + StridedLoad->insertBefore(LoadR); + LoadR->replaceAllUsesWith(StridedLoad); + ToErase.push_back(LoadR); + } + } + } + + for (VPRecipeBase *R : ToErase) + R->eraseFromParent(); +} + // Expand VPExtendedReductionRecipe to VPWidenCastRecipe + VPReductionRecipe. static void expandVPExtendedReduction(VPExtendedReductionRecipe *ExtRed) { VPWidenCastRecipe *Ext; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 3a1ed7406b383..94fb79be1521e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -171,6 +171,10 @@ struct VPlanTransforms { &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed); + // !!! NEED COMMENT + static void convertToStridedAccesses( + VPlan &Plan, const SmallDenseMap &StrideInfo); + /// Remove dead recipes from \p Plan. 
static void removeDeadRecipes(VPlan &Plan); From a0b61ad94811fe286be3ee4ec88c86d6edf4be76 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Wed, 21 May 2025 00:10:24 -0700 Subject: [PATCH 4/7] [WIP][VPlan based] Clamp VF range in VPlan transformation --- .../Transforms/Vectorize/LoopVectorize.cpp | 79 ++---------- llvm/lib/Transforms/Vectorize/VPlan.h | 6 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 27 ++-- .../Transforms/Vectorize/VPlanTransforms.cpp | 92 ++++++++------ .../Transforms/Vectorize/VPlanTransforms.h | 4 +- .../RISCV/riscv-vector-reverse.ll | 118 ++++++++++-------- 6 files changed, 146 insertions(+), 180 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 1d577d3ee0f6a..f9c6ea06fa9e1 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1329,15 +1329,6 @@ class LoopVectorizationCostModel { return InterleaveInfo.getInterleaveGroup(Instr); } - /// Returns true if \p I is a memory instruction with strided memory access - /// that can be vectorized. - bool stridedAccessCanBeWidened(Instruction *I, ElementCount VF) const; - - /// Get the stride information of the strided memory accesses. - SmallDenseMap getStrideInfo() const { - return StrideInfo; - } - /// Returns true if we're required to use a scalar epilogue for at least /// the final iteration of the original loop. bool requiresScalarEpilogue(bool IsVectorizing) const { @@ -1592,10 +1583,6 @@ class LoopVectorizationCostModel { /// element) InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); - /// The cost computation for strided load/store instruction. - InstructionCost getStridedLoadStoreCost(Instruction *I, - ElementCount VF) const; - /// Estimate the overhead of scalarizing an instruction. This is a /// convenience wrapper for the type-based getScalarizationOverhead API. InstructionCost getScalarizationOverhead(Instruction *I, @@ -1735,9 +1722,6 @@ class LoopVectorizationCostModel { Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } - /// The mapping of memory access instructions to their stride values. - SmallDenseMap StrideInfo; - public: /// The loop that we evaluate. Loop *TheLoop; @@ -3295,31 +3279,6 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( return true; } -bool LoopVectorizationCostModel::stridedAccessCanBeWidened( - Instruction *I, ElementCount VF) const { - // Get and ensure we have a valid memory instruction. - assert((isa(I)) && "Invalid memory instruction"); - - // Only support strided access for vector VF. - if (!VF.isVector()) - return false; - - // FIXME: Remove this check for StoreInst after strided store is supported. - if (isa(I)) - return false; - - [[maybe_unused]] auto *Ptr = getLoadStorePointerOperand(I); - auto *ScalarTy = getLoadStoreType(I); - // TODO: Support non-unit-reverse strided accesses. Add stride analysis here - // to ensure that the accessed addresses are evenly spaced apart by a fixed - // stride. - assert(Legal->isConsecutivePtr(ScalarTy, Ptr) == -1 && - "Only supports strided accesses with a stride of -1"); - - const Align Alignment = getLoadStoreAlignment(I); - return TTI.isLegalStridedLoadStore(toVectorTy(ScalarTy, VF), Alignment); -} - void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // We should not collect Uniforms more than once per VF. 
Right now, // this function is called from collectUniformsAndScalars(), which @@ -5723,19 +5682,6 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, return Cost; } -InstructionCost -LoopVectorizationCostModel::getStridedLoadStoreCost(Instruction *I, - ElementCount VF) const { - Type *ValTy = getLoadStoreType(I); - auto *VectorTy = cast(toVectorTy(ValTy, VF)); - const Align Alignment = getLoadStoreAlignment(I); - const Value *Ptr = getLoadStorePointerOperand(I); - - return TTI.getStridedMemoryOpCost(I->getOpcode(), VectorTy, Ptr, - Legal->isMaskRequired(I), Alignment, - CostKind, I); -} - std::optional LoopVectorizationCostModel::getReductionPatternCost(Instruction *I, ElementCount VF, @@ -6055,17 +6001,6 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { "Expected consecutive stride."); InstWidening Decision = ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; - // Consider using strided load/store for consecutive reverse accesses to - // achieve more efficient memory operations. - if (ConsecutiveStride == -1 && stridedAccessCanBeWidened(&I, VF)) { - const InstructionCost StridedLoadStoreCost = - getStridedLoadStoreCost(&I, VF); - if (StridedLoadStoreCost < Cost) { - Decision = CM_Strided; - Cost = StridedLoadStoreCost; - StrideInfo[&I] = ConsecutiveStride; - } - } setWideningDecision(&I, VF, Decision, Cost); continue; } @@ -9478,12 +9413,15 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, // clamp the range for better cost estimation. // TODO: Enable following transform when the EVL-version of extended-reduction // and mulacc-reduction are implemented. - if (!CM.foldTailWithEVL()) { - VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, - CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, + CM.CostKind); + if (!CM.foldTailWithEVL()) VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); - } + + // !!! NEED COMMENT + VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan, + CostCtx, Range); for (ElementCount VF : Range) Plan->addVF(VF); @@ -9495,9 +9433,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, VPlanTransforms::runPass(VPlanTransforms::createInterleaveGroups, *Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed()); - // !!! NEED COMMENT - VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan, - CM.getStrideInfo()); // Replace VPValues for known constant strides guaranteed by predicate scalar // evolution. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index a579191e5c4a6..773865ca72b0b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1718,6 +1718,8 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, VP_CLASSOF_IMPL(VPDef::VPVectorEndPointerSC) + VPValue *getPtr() const { return getOperand(0); } + VPValue *getVFValue() { return getOperand(1); } const VPValue *getVFValue() const { return getOperand(1); } @@ -3161,10 +3163,6 @@ struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe, /// Generate a strided load. void execute(VPTransformState &State) override; - /// Return the cost of this VPWidenStridedLoadRecipe. - InstructionCost computeCost(ElementCount VF, - VPCostContext &Ctx) const override; - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. 
void print(raw_ostream &O, const Twine &Indent, diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 724d1d2f68aac..fbd39ee73e41a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2316,7 +2316,7 @@ void VPVectorEndPointerRecipe::execute(VPTransformState &State) { ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF); // LastLane = 1 - RunTimeVF Value *LastLane = Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); - Value *Ptr = State.get(getOperand(0), VPLane(0)); + Value *Ptr = State.get(getPtr(), VPLane(0)); Value *ResultPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags()); ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "", @@ -2923,9 +2923,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, getLoadStoreAlignment(const_cast(&Ingredient)); unsigned AS = cast(Ctx.Types.inferScalarType(getAddr())) ->getAddressSpace(); - unsigned Opcode = isa(this) - ? Instruction::Load - : Instruction::Store; + unsigned Opcode = + isa( + this) + ? Instruction::Load + : Instruction::Store; if (!Consecutive) { // TODO: Using the original IR may not be accurate. @@ -2934,6 +2936,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, const Value *Ptr = getLoadStorePointerOperand(&Ingredient); assert(!Reverse && "Inconsecutive memory access should not have the order."); + + if (isa(this)) + return Ctx.TTI.getStridedMemoryOpCost( + Opcode, Ty, Ptr, IsMasked, Alignment, Ctx.CostKind, &Ingredient); + return Ctx.TTI.getAddressComputationCost(Ty) + Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment, Ctx.CostKind, &Ingredient); @@ -3128,18 +3135,6 @@ void VPWidenStridedLoadRecipe::print(raw_ostream &O, const Twine &Indent, } #endif -InstructionCost -VPWidenStridedLoadRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); - const Align Alignment = getLoadStoreAlignment(&Ingredient); - const Value *Ptr = getLoadStorePointerOperand(&Ingredient); - - return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, Ptr, - IsMasked, Alignment, Ctx.CostKind, - &Ingredient); -} - void VPWidenStoreRecipe::execute(VPTransformState &State) { VPValue *StoredVPValue = getStoredValue(); bool CreateScatter = !isConsecutive(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index c3bcec758ccce..bf7adcc98d78d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2517,48 +2517,68 @@ void VPlanTransforms::createInterleaveGroups( } } -void VPlanTransforms::convertToStridedAccesses( - VPlan &Plan, const SmallDenseMap &StrideInfo) { - // !!! FIXME: Should remove StrideInfo for next step. - if (Plan.hasScalarVFOnly() || StrideInfo.empty()) +void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx, + VFRange &Range) { + if (Plan.hasScalarVFOnly()) return; - // !!! FIXME: Should clamp VF for legal and cost in next step SmallVector ToErase; for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - // !!! 
FIXME: Should use LoadR->isReverse() for next step - if (auto *LoadR = dyn_cast(&R); - LoadR && !LoadR->isConsecutive()) { - auto *LI = cast(&LoadR->getIngredient()); - auto It = StrideInfo.find(LI); - if (It == StrideInfo.end()) - continue; - int64_t Stride = It->second; - assert(Stride == -1 && - "Only stride memory access with a stride of -1 is supported."); - // !!! FIXME: Should get VPVectorEndPointerRecipe for reverse - VPValue *Ptr = LoadR->getAddr(); - auto *GEP = dyn_cast( - Ptr->getUnderlyingValue()->stripPointerCasts()); - auto *NewPtr = new VPVectorPointerRecipe( - Ptr, getLoadStoreType(LI), /*Stride*/ true, - GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), - LoadR->getDebugLoc()); - NewPtr->insertBefore(LoadR); - - const DataLayout &DL = LI->getDataLayout(); - auto *StrideTy = DL.getIndexType(LI->getPointerOperand()->getType()); - VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get( - StrideTy, Stride * DL.getTypeAllocSize(getLoadStoreType(LI)))); - auto *StridedLoad = new VPWidenStridedLoadRecipe( - *LI, NewPtr, StrideVPV, &Plan.getVF(), LoadR->getMask(), *LoadR, - LoadR->getDebugLoc()); - StridedLoad->insertBefore(LoadR); - LoadR->replaceAllUsesWith(StridedLoad); - ToErase.push_back(LoadR); - } + auto *MemR = dyn_cast(&R); + // TODO: support strided store + // TODO: support strided accesses with stride not equal to -1 + if (!MemR || !isa(MemR) || !MemR->isReverse()) + continue; + + Instruction &Ingredient = MemR->getIngredient(); + Type *ElementTy = getLoadStoreType(&Ingredient); + + auto IsProfitable = [&](ElementCount VF) -> bool { + Type *DataTy = toVectorTy(ElementTy, VF); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment)) + return false; + const InstructionCost CurrentCost = MemR->computeCost(VF, Ctx); + const InstructionCost StridedLoadStoreCost = + Ctx.TTI.getStridedMemoryOpCost( + Ingredient.getOpcode(), DataTy, + getLoadStorePointerOperand(&Ingredient), MemR->isMasked(), + Alignment, Ctx.CostKind, &Ingredient); + return StridedLoadStoreCost < CurrentCost; + }; + + if (!LoopVectorizationPlanner::getDecisionAndClampRange(IsProfitable, + Range)) + continue; + + // The stride of consecutive reverse access must be -1. + int64_t Stride = -1; + auto *VecEndPtr = cast(MemR->getAddr()); + VPValue *Ptr = VecEndPtr->getPtr(); + auto *GEP = dyn_cast( + Ptr->getUnderlyingValue()->stripPointerCasts()); + // Create a new vector pointer for strided access. + auto *NewPtr = new VPVectorPointerRecipe( + Ptr, ElementTy, /*Stride=*/ true, + GEP ? 
GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), + VecEndPtr->getDebugLoc()); + NewPtr->insertBefore(MemR); + + auto *LoadR = cast(MemR); + auto *LI = cast(&Ingredient); + const DataLayout &DL = LI->getDataLayout(); + auto *StrideTy = DL.getIndexType(LI->getPointerOperand()->getType()); + VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get( + StrideTy, Stride * DL.getTypeAllocSize(ElementTy))); + auto *StridedLoad = new VPWidenStridedLoadRecipe( + *LI, NewPtr, StrideVPV, &Plan.getVF(), LoadR->getMask(), *LoadR, + LoadR->getDebugLoc()); + StridedLoad->insertBefore(LoadR); + LoadR->replaceAllUsesWith(StridedLoad); + + ToErase.append({LoadR, VecEndPtr}); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 94fb79be1521e..91a4c1fc4cb9a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -172,8 +172,8 @@ struct VPlanTransforms { VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed); // !!! NEED COMMENT - static void convertToStridedAccesses( - VPlan &Plan, const SmallDenseMap &StrideInfo); + static void convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx, + VFRange &Range); /// Remove dead recipes from \p Plan. static void removeDeadRecipes(VPlan &Plan); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 9da3abe53abe9..7382fffdbb0ed 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -25,15 +25,20 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: The max safe fixed VF is: 67108864. ; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295. 
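The declaration added to VPlanTransforms.h above still carries a "// !!! NEED COMMENT" placeholder. One possible doc comment, offered as a suggestion based on what the transform currently does rather than as the author's wording:

/// Convert reverse (consecutive, stride -1) memory recipes into strided
/// accesses when the target supports them and they are cheaper, clamping
/// \p Range so that every VF left in the range takes the same decision.
/// Currently limited to loads: a reverse VPWidenLoadRecipe and its
/// vector-end-pointer are replaced by a VPVectorPointerRecipe feeding a
/// VPWidenStridedLoadRecipe with a negative byte stride.
static void convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
                                     VFRange &Range);
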
; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1 +; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom +; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 +; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] +; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4 @@ -42,6 +47,9 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 ; CHECK-NEXT: LV: Using user VF vscale x 4. 
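The per-VF decision made by the transform above reduces to a legality query plus a cost comparison against the existing reverse lowering. A stripped-down C++ sketch of that check, assuming a TargetTransformInfo handle and a precomputed cost for the current recipe (a restatement for readability, not the transform's exact code):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"

// Prefer the strided form only if the target legally supports it and its TTI
// cost is strictly lower than the reverse (contiguous load + reverse) form.
static bool preferStridedLoad(const llvm::TargetTransformInfo &TTI,
                              llvm::VectorType *DataTy, llvm::Align Alignment,
                              llvm::InstructionCost ReverseCost) {
  if (!TTI.isLegalStridedLoadStore(DataTy, Alignment))
    return false;
  llvm::InstructionCost StridedCost = TTI.getStridedMemoryOpCost(
      llvm::Instruction::Load, DataTy, /*Ptr=*/nullptr,
      /*VariableMask=*/false, Alignment,
      llvm::TargetTransformInfo::TCK_RecipThroughput);
  return StridedCost.isValid() && StridedCost < ReverseCost;
}

Clamping the VF range with LoopVectorizationPlanner::getDecisionAndClampRange then guarantees a single decision for all VFs kept in the plan.
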
; CHECK-NEXT: LV: Loop does not require scalar epilogue +; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1 @@ -64,9 +72,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION -; CHECK-NEXT: ir<[[WIDEN_IV:%.+]]> = WIDEN-INDUCTION ir<%n>, ir<-1>, vp<[[VF]]> -; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1> -; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64 +; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> ; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> ; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> ; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]> @@ -101,10 +110,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: } ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4 @@ -115,26 +124,27 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV(REG): At #0 Interval # 0 ; CHECK-NEXT: LV(REG): At #1 Interval # 1 ; CHECK-NEXT: LV(REG): At #2 Interval # 2 -; CHECK-NEXT: LV(REG): At #3 Interval # 3 -; CHECK-NEXT: LV(REG): At #4 Interval # 3 -; CHECK-NEXT: LV(REG): At #5 Interval # 4 -; CHECK-NEXT: LV(REG): At #6 Interval # 4 -; CHECK-NEXT: 
LV(REG): At #7 Interval # 4 -; CHECK-NEXT: LV(REG): At #8 Interval # 4 -; CHECK-NEXT: LV(REG): At #9 Interval # 4 -; CHECK-NEXT: LV(REG): At #10 Interval # 4 +; CHECK-NEXT: LV(REG): At #3 Interval # 2 +; CHECK-NEXT: LV(REG): At #4 Interval # 2 +; CHECK-NEXT: LV(REG): At #5 Interval # 2 +; CHECK-NEXT: LV(REG): At #6 Interval # 3 +; CHECK-NEXT: LV(REG): At #7 Interval # 3 +; CHECK-NEXT: LV(REG): At #8 Interval # 3 +; CHECK-NEXT: LV(REG): At #9 Interval # 3 +; CHECK-NEXT: LV(REG): At #10 Interval # 3 ; CHECK-NEXT: LV(REG): At #11 Interval # 3 -; CHECK-NEXT: LV(REG): At #12 Interval # 3 +; CHECK-NEXT: LV(REG): At #12 Interval # 2 +; CHECK-NEXT: LV(REG): At #13 Interval # 2 ; CHECK-NEXT: LV(REG): VF = vscale x 4 ; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers +; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers +; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class ; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Loop cost is 31 +; CHECK-NEXT: LV: Loop cost is 32 ; CHECK-NEXT: LV: IC is 1 ; CHECK-NEXT: LV: VF is vscale x 4 ; CHECK-NEXT: LV: Not Interleaving. @@ -183,15 +193,14 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: IR %18 = mul i64 %17, 4 ; CHECK-NEXT: vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1> ; CHECK-NEXT: vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1> -; CHECK-NEXT: EMIT vp<[[STEP:%.+]]> = step-vector i32 ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ] -; CHECK-NEXT: ir<[[WIDEN_IV:%.+]]> = WIDEN-INDUCTION ir<%n>, ir<-1>, ir<%18>, vp<[[STEP]]> -; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1> -; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64 +; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> +; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> ; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> ; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> ; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]> @@ -267,15 +276,20 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: The max safe fixed VF is: 67108864. ; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295. 
; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1 +; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom +; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 +; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] +; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 @@ -284,6 +298,9 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 ; CHECK-NEXT: LV: Using user VF vscale x 4. 
; CHECK-NEXT: LV: Loop does not require scalar epilogue +; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1 @@ -306,9 +323,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION -; CHECK-NEXT: ir<[[WIDEN_IV:%.+]]> = WIDEN-INDUCTION ir<%n>, ir<-1>, vp<[[VF]]> -; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1> -; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64 +; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> ; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> ; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> ; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]> @@ -343,10 +361,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: } ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 +; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 8 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 @@ -357,26 +375,27 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV(REG): At #0 Interval # 0 ; CHECK-NEXT: LV(REG): At #1 Interval # 1 ; CHECK-NEXT: LV(REG): At #2 Interval # 2 -; CHECK-NEXT: LV(REG): At #3 Interval # 3 -; CHECK-NEXT: LV(REG): At #4 Interval # 3 -; CHECK-NEXT: LV(REG): At #5 Interval # 4 -; CHECK-NEXT: LV(REG): At 
#6 Interval # 4 -; CHECK-NEXT: LV(REG): At #7 Interval # 4 -; CHECK-NEXT: LV(REG): At #8 Interval # 4 -; CHECK-NEXT: LV(REG): At #9 Interval # 4 -; CHECK-NEXT: LV(REG): At #10 Interval # 4 +; CHECK-NEXT: LV(REG): At #3 Interval # 2 +; CHECK-NEXT: LV(REG): At #4 Interval # 2 +; CHECK-NEXT: LV(REG): At #5 Interval # 2 +; CHECK-NEXT: LV(REG): At #6 Interval # 3 +; CHECK-NEXT: LV(REG): At #7 Interval # 3 +; CHECK-NEXT: LV(REG): At #8 Interval # 3 +; CHECK-NEXT: LV(REG): At #9 Interval # 3 +; CHECK-NEXT: LV(REG): At #10 Interval # 3 ; CHECK-NEXT: LV(REG): At #11 Interval # 3 -; CHECK-NEXT: LV(REG): At #12 Interval # 3 +; CHECK-NEXT: LV(REG): At #12 Interval # 2 +; CHECK-NEXT: LV(REG): At #13 Interval # 2 ; CHECK-NEXT: LV(REG): VF = vscale x 4 ; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers +; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers +; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class ; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Loop cost is 33 +; CHECK-NEXT: LV: Loop cost is 34 ; CHECK-NEXT: LV: IC is 1 ; CHECK-NEXT: LV: VF is vscale x 4 ; CHECK-NEXT: LV: Not Interleaving. @@ -425,15 +444,14 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: IR %18 = mul i64 %17, 4 ; CHECK-NEXT: vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1> ; CHECK-NEXT: vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1> -; CHECK-NEXT: EMIT vp<[[STEP:%.+]]> = step-vector i32 ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ] -; CHECK-NEXT: ir<[[WIDEN_IV:%.+]]> = WIDEN-INDUCTION ir<%n>, ir<-1>, ir<%18>, vp<[[STEP]]> -; CHECK-NEXT: WIDEN ir<[[IDX:%.+]]> = add nsw ir<[[WIDEN_IV]]>, ir<-1> -; CHECK-NEXT: WIDEN-CAST ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> to i64 +; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> +; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> ; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> ; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> ; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]> From 95a890b305c76e34886f62a6433cac935615ec66 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Wed, 21 May 2025 00:15:28 -0700 Subject: [PATCH 5/7] [WIP][VPlan based] Time to remove CM_Strided --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f9c6ea06fa9e1..ae45c7e88948a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1095,7 +1095,6 @@ class LoopVectorizationCostModel { CM_Widen_Reverse, // For consecutive accesses with stride -1. 
CM_Interleave, CM_GatherScatter, - CM_Strided, CM_Scalarize, CM_VectorCall, CM_IntrinsicCall @@ -6652,8 +6651,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, return TTI::CastContextHint::Normal; switch (getWideningDecision(I, VF)) { - // TODO: New CastContextHint for strided accesses. - case LoopVectorizationCostModel::CM_Strided: case LoopVectorizationCostModel::CM_GatherScatter: return TTI::CastContextHint::GatherScatter; case LoopVectorizationCostModel::CM_Interleave: From 4d0301a7459adb4619e7bbd14cd36122dfcc81c0 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Wed, 21 May 2025 00:58:56 -0700 Subject: [PATCH 6/7] [VPlan based] Patch comments, nfc --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 7 ++++--- llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 4 +++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index ae45c7e88948a..f5052e69d830b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9406,17 +9406,18 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, // Adjust the recipes for any inloop reductions. adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, + CM.CostKind); // Transform recipes to abstract recipes if it is legal and beneficial and // clamp the range for better cost estimation. // TODO: Enable following transform when the EVL-version of extended-reduction // and mulacc-reduction are implemented. - VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, - CM.CostKind); if (!CM.foldTailWithEVL()) VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); - // !!! NEED COMMENT + // Convert reverse memory recipes to strided access recipes if the strided + // access is legal and profitable. VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan, CostCtx, Range); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 91a4c1fc4cb9a..bddbc5e87238b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -171,7 +171,9 @@ struct VPlanTransforms { &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed); - // !!! NEED COMMENT + /// Transform reverse memory recipes into strided access recipes when legal + /// and profitable. Clamps \p Range to maintain consistency with widen + /// decisions of \p Plan, and uses \p Ctx to evaluate the cost. 
static void convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx, VFRange &Range); From f67a62775991b8ed761cab03dc052a330a40cebc Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Wed, 21 May 2025 01:21:22 -0700 Subject: [PATCH 7/7] Format --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index bf7adcc98d78d..63a5e0961b6da 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2560,18 +2560,18 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx, auto *GEP = dyn_cast( Ptr->getUnderlyingValue()->stripPointerCasts()); // Create a new vector pointer for strided access. - auto *NewPtr = new VPVectorPointerRecipe( - Ptr, ElementTy, /*Stride=*/ true, - GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), - VecEndPtr->getDebugLoc()); + auto *NewPtr = new VPVectorPointerRecipe(Ptr, ElementTy, /*Stride=*/true, + GEP ? GEP->getNoWrapFlags() + : GEPNoWrapFlags::none(), + VecEndPtr->getDebugLoc()); NewPtr->insertBefore(MemR); auto *LoadR = cast(MemR); auto *LI = cast(&Ingredient); const DataLayout &DL = LI->getDataLayout(); auto *StrideTy = DL.getIndexType(LI->getPointerOperand()->getType()); - VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get( - StrideTy, Stride * DL.getTypeAllocSize(ElementTy))); + VPValue *StrideVPV = Plan.getOrAddLiveIn( + ConstantInt::get(StrideTy, Stride * DL.getTypeAllocSize(ElementTy))); auto *StridedLoad = new VPWidenStridedLoadRecipe( *LI, NewPtr, StrideVPV, &Plan.getVF(), LoadR->getMask(), *LoadR, LoadR->getDebugLoc());
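Note (not part of the patches): the hunks above replace a reversed widen load with a VPWidenStridedLoadRecipe whose stride operand is the element stride scaled to bytes (Stride * DL.getTypeAllocSize(ElementTy), i.e. -4 for an i32 access), matching the "stride = ir<-4>" operand printed in the VPlan dumps earlier in the series. The standalone C++ sketch below only illustrates that access pattern — a single base pointer walked by a signed byte stride, as a strided hardware load (e.g. RISC-V vlse) would do. The function name, the emulation loop, and the test values are invented for this illustration and do not appear anywhere in the patch.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Scalar emulation of a widened strided load: one base pointer advanced by a
// signed byte stride per lane. With StrideBytes == -4 this produces the
// reverse i32 access modeled by the new strided-load recipe.
static std::vector<int32_t> emulateStridedLoad(const int32_t *Base,
                                               std::ptrdiff_t StrideBytes,
                                               std::size_t NumLanes) {
  std::vector<int32_t> Lanes(NumLanes);
  const char *Addr = reinterpret_cast<const char *>(Base);
  for (std::size_t I = 0; I < NumLanes; ++I)
    Lanes[I] = *reinterpret_cast<const int32_t *>(
        Addr + static_cast<std::ptrdiff_t>(I) * StrideBytes);
  return Lanes;
}

int main() {
  int32_t B[] = {10, 20, 30, 40};
  // Base points at the last element; a -4 byte stride walks the array backwards,
  // so the loaded "vector" is {40, 30, 20, 10}.
  std::vector<int32_t> V = emulateStridedLoad(&B[3], /*StrideBytes=*/-4, /*NumLanes=*/4);
  assert(V[0] == 40 && V[3] == 10);
  return 0;
}

The design point this illustrates is why the base pointer must be rewritten by the transform: unlike the reverse-vector-pointer recipe, the strided form needs the address of the first accessed element only, with the per-lane offsets folded into the (negative) byte stride operand.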