diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b2d7c44761f6d..f5052e69d830b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4187,7 +4187,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( [](const auto *R) { return Instruction::Select; }) .Case( [](const auto *R) { return Instruction::Store; }) - .Case( + .Case( [](const auto *R) { return Instruction::Load; }) .Case( [](const auto *R) { return Instruction::Call; }) @@ -4286,6 +4286,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, case VPDef::VPWidenPointerInductionSC: case VPDef::VPReductionPHISC: case VPDef::VPInterleaveSC: + case VPDef::VPWidenStridedLoadSC: case VPDef::VPWidenLoadEVLSC: case VPDef::VPWidenLoadSC: case VPDef::VPWidenStoreEVLSC: @@ -8253,10 +8254,10 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, VectorPtr = new VPVectorEndPointerRecipe( Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc()); } else { - VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), - GEP ? GEP->getNoWrapFlags() - : GEPNoWrapFlags::none(), - I->getDebugLoc()); + VectorPtr = new VPVectorPointerRecipe( + Ptr, getLoadStoreType(I), /*Strided*/ false, + GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), + I->getDebugLoc()); } Builder.insert(VectorPtr); Ptr = VectorPtr; @@ -9405,16 +9406,20 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, // Adjust the recipes for any inloop reductions. adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, + CM.CostKind); // Transform recipes to abstract recipes if it is legal and beneficial and // clamp the range for better cost estimation. // TODO: Enable following transform when the EVL-version of extended-reduction // and mulacc-reduction are implemented. - if (!CM.foldTailWithEVL()) { - VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, - CM.CostKind); + if (!CM.foldTailWithEVL()) VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); - } + + // Convert reverse memory recipes to strided access recipes if the strided + // access is legal and profitable. + VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan, + CostCtx, Range); for (ElementCount VF : Range) Plan->addVF(VF); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index e634de1e17c69..773865ca72b0b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -544,6 +544,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPBranchOnMaskSC: case VPRecipeBase::VPInterleaveSC: case VPRecipeBase::VPIRInstructionSC: + case VPRecipeBase::VPWidenStridedLoadSC: case VPRecipeBase::VPWidenLoadEVLSC: case VPRecipeBase::VPWidenLoadSC: case VPRecipeBase::VPWidenStoreEVLSC: @@ -1717,6 +1718,8 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, VP_CLASSOF_IMPL(VPDef::VPVectorEndPointerSC) + VPValue *getPtr() const { return getOperand(0); } + VPValue *getVFValue() { return getOperand(1); } const VPValue *getVFValue() const { return getOperand(1); } @@ -1756,16 +1759,21 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, }; /// A recipe to compute the pointers for widened memory accesses of IndexTy. 
+/// Supports both consecutive and reverse consecutive accesses. +/// TODO: Support non-unit strided accesses. class VPVectorPointerRecipe : public VPRecipeWithIRFlags, public VPUnrollPartAccessor<1> { Type *IndexedTy; + /// Indicate whether to compute the pointer for strided memory accesses. + bool Strided; + public: - VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, GEPNoWrapFlags GEPFlags, - DebugLoc DL) + VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool Strided, + GEPNoWrapFlags GEPFlags, DebugLoc DL) : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef(Ptr), GEPFlags, DL), - IndexedTy(IndexedTy) {} + IndexedTy(IndexedTy), Strided(Strided) {} VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC) @@ -1786,7 +1794,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags, } VPVectorPointerRecipe *clone() override { - return new VPVectorPointerRecipe(getOperand(0), IndexedTy, + return new VPVectorPointerRecipe(getOperand(0), IndexedTy, Strided, getGEPNoWrapFlags(), getDebugLoc()); } @@ -3003,7 +3011,8 @@ class VPWidenMemoryRecipe : public VPRecipeBase, public VPIRMetadata { return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC || R->getVPDefID() == VPRecipeBase::VPWidenStoreSC || R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC || - R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC; + R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC || + R->getVPDefID() == VPRecipeBase::VPWidenStridedLoadSC; } static inline bool classof(const VPUser *U) { @@ -3122,6 +3131,52 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { } }; +/// A recipe for strided load operations, using the base address, stride, and an +/// optional mask. This recipe will generate a vp.strided.load intrinsic call +/// to represent memory accesses with a fixed stride. +struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe, + public VPValue { + VPWidenStridedLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Stride, + VPValue *VF, VPValue *Mask, + const VPIRMetadata &Metadata, DebugLoc DL) + : VPWidenMemoryRecipe( + VPDef::VPWidenStridedLoadSC, Load, {Addr, Stride, VF}, + /*Consecutive=*/false, /*Reverse=*/false, Metadata, DL), + VPValue(this, &Load) { + setMask(Mask); + } + + VPWidenStridedLoadRecipe *clone() override { + return new VPWidenStridedLoadRecipe(cast<LoadInst>(Ingredient), getAddr(), + getStride(), getVF(), getMask(), *this, + getDebugLoc()); + } + + VP_CLASSOF_IMPL(VPDef::VPWidenStridedLoadSC) + + /// Return the stride operand. + VPValue *getStride() const { return getOperand(1); } + + /// Return the VF operand. + VPValue *getVF() const { return getOperand(2); } + + /// Generate a strided load. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// Returns true if the recipe only uses the first lane of operand \p Op. + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return Op == getAddr() || Op == getStride() || Op == getVF(); + } +}; + /// A recipe for widening store operations, using the stored value, the address /// to store to and an optional mask.
struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe { diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index ac0f30cb4693c..d3e7455c4f79e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -180,8 +180,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) { } Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) { - assert((isa(R)) && - "Store recipes should not define any values"); + assert( + (isa( + R)) && + "Store recipes should not define any values"); return cast(&R->getIngredient())->getType(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 14ed40f16683a..fbd39ee73e41a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -80,6 +80,7 @@ bool VPRecipeBase::mayWriteToMemory() const { case VPWidenCastSC: case VPWidenGEPSC: case VPWidenIntOrFpInductionSC: + case VPWidenStridedLoadSC: case VPWidenLoadEVLSC: case VPWidenLoadSC: case VPWidenPHISC: @@ -101,6 +102,7 @@ bool VPRecipeBase::mayReadFromMemory() const { switch (getVPDefID()) { case VPInstructionSC: return cast(this)->opcodeMayReadOrWriteFromMemory(); + case VPWidenStridedLoadSC: case VPWidenLoadEVLSC: case VPWidenLoadSC: return true; @@ -182,6 +184,7 @@ bool VPRecipeBase::mayHaveSideEffects() const { } case VPInterleaveSC: return mayWriteToMemory(); + case VPWidenStridedLoadSC: case VPWidenLoadEVLSC: case VPWidenLoadSC: case VPWidenStoreEVLSC: @@ -2313,7 +2316,7 @@ void VPVectorEndPointerRecipe::execute(VPTransformState &State) { ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF); // LastLane = 1 - RunTimeVF Value *LastLane = Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); - Value *Ptr = State.get(getOperand(0), VPLane(0)); + Value *Ptr = State.get(getPtr(), VPLane(0)); Value *ResultPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags()); ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "", @@ -2341,8 +2344,13 @@ void VPVectorPointerRecipe::execute(VPTransformState &State) { Value *Ptr = State.get(getOperand(0), VPLane(0)); Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart); + // TODO: Support non-unit-reverse strided accesses. + Value *Index = + Strided + ? Builder.CreateMul(Increment, ConstantInt::getSigned(IndexTy, -1)) + : Increment; Value *ResultPtr = - Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags()); + Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags()); State.set(this, ResultPtr, /*IsScalar*/ true); } @@ -2915,9 +2923,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, getLoadStoreAlignment(const_cast(&Ingredient)); unsigned AS = cast(Ctx.Types.inferScalarType(getAddr())) ->getAddressSpace(); - unsigned Opcode = isa(this) - ? Instruction::Load - : Instruction::Store; + unsigned Opcode = + isa( + this) + ? Instruction::Load + : Instruction::Store; if (!Consecutive) { // TODO: Using the original IR may not be accurate. 
@@ -2926,6 +2936,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, const Value *Ptr = getLoadStorePointerOperand(&Ingredient); assert(!Reverse && "Inconsecutive memory access should not have the order."); + + if (isa(this)) + return Ctx.TTI.getStridedMemoryOpCost( + Opcode, Ty, Ptr, IsMasked, Alignment, Ctx.CostKind, &Ingredient); + return Ctx.TTI.getAddressComputationCost(Ty) + Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment, Ctx.CostKind, &Ingredient); @@ -3078,6 +3093,48 @@ void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +void VPWidenStridedLoadRecipe::execute(VPTransformState &State) { + Type *ScalarDataTy = getLoadStoreType(&Ingredient); + auto *DataTy = VectorType::get(ScalarDataTy, State.VF); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + + auto &Builder = State.Builder; + State.setDebugLocFrom(getDebugLoc()); + Value *Addr = State.get(getAddr(), /*IsScalar*/ true); + Value *Stride = State.get(getStride(), /*IsScalar*/ true); + Value *Mask = nullptr; + if (VPValue *VPMask = getMask()) + Mask = State.get(VPMask); + else + Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); + Value *RunTimeVF = Builder.CreateZExtOrTrunc(State.get(getVF(), VPLane(0)), + Builder.getInt32Ty()); + + auto *PtrTy = Addr->getType(); + auto *StrideTy = Stride->getType(); + CallInst *NewLI = Builder.CreateIntrinsic( + Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy}, + {Addr, Stride, Mask, RunTimeVF}, nullptr, "wide.strided.load"); + NewLI->addParamAttr( + 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); + applyMetadata(*NewLI); + State.set(this, NewLI); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPWidenStridedLoadRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN "; + printAsOperand(O, SlotTracker); + O << " = load "; + getAddr()->printAsOperand(O, SlotTracker); + O << ", stride = "; + getStride()->printAsOperand(O, SlotTracker); + O << ", runtimeVF = "; + getVF()->printAsOperand(O, SlotTracker); +} +#endif + void VPWidenStoreRecipe::execute(VPTransformState &State) { VPValue *StoredVPValue = getStoredValue(); bool CreateScatter = !isConsecutive(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index bc007589e1539..63a5e0961b6da 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -626,13 +626,14 @@ static SmallVector collectUsersRecursively(VPValue *V) { static void legalizeAndOptimizeInductions(VPlan &Plan) { using namespace llvm::VPlanPatternMatch; VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); - bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly(); - VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi()); - for (VPRecipeBase &Phi : HeaderVPBB->phis()) { - auto *PhiR = dyn_cast(&Phi); - if (!PhiR) - continue; + SmallVector InductionPhis; + for (VPRecipeBase &R : HeaderVPBB->phis()) + if (auto *IV = dyn_cast(&R)) + InductionPhis.push_back(IV); + bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly(); + VPBuilder Builder; + for (VPWidenInductionRecipe *PhiR : reverse(InductionPhis)) { // Try to narrow wide and replicating recipes to uniform recipes, based on // VPlan analysis. 
// TODO: Apply to all recipes in the future, to replace legacy uniformity @@ -642,7 +643,8 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { auto *Def = dyn_cast(U); auto *RepR = dyn_cast(U); // Skip recipes that shouldn't be narrowed. - if (!Def || !isa(Def) || + if (!Def || + !isa(Def) || Def->getNumUsers() == 0 || !Def->getUnderlyingValue() || (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))) continue; @@ -655,11 +657,13 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { Def->operands(), /*IsUniform*/ true); Clone->insertAfter(Def); Def->replaceAllUsesWith(Clone); + Def->eraseFromParent(); } + Builder.setInsertPoint(HeaderVPBB, HeaderVPBB->getFirstNonPhi()); // Replace wide pointer inductions which have only their scalars used by // PtrAdd(IndStart, ScalarIVSteps (0, Step)). - if (auto *PtrIV = dyn_cast(&Phi)) { + if (auto *PtrIV = dyn_cast(PhiR)) { if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF())) continue; @@ -680,7 +684,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { // Replace widened induction with scalar steps for users that only use // scalars. - auto *WideIV = cast(&Phi); + auto *WideIV = cast(PhiR); if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) { return U->usesScalars(WideIV); })) @@ -2117,6 +2121,12 @@ static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask, VPValue *NewMask = GetNewMask(L->getMask()); return new VPWidenLoadEVLRecipe(*L, EVL, NewMask); }) + .Case([&](VPWidenStridedLoadRecipe *L) { + VPValue *NewMask = GetNewMask(L->getMask()); + return new VPWidenStridedLoadRecipe( + *cast(&L->getIngredient()), L->getAddr(), L->getStride(), + &EVL, NewMask, *L, L->getDebugLoc()); + }) .Case([&](VPWidenStoreRecipe *S) { VPValue *NewMask = GetNewMask(S->getMask()); return new VPWidenStoreEVLRecipe(*S, EVL, NewMask); @@ -2215,7 +2225,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { NumDefVal <= 1 && "Only supports recipes with a single definition or without users."); EVLRecipe->insertBefore(CurRecipe); - if (isa(EVLRecipe)) { + if (isa(EVLRecipe)) { VPValue *CurVPV = CurRecipe->getVPSingleValue(); CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue()); } @@ -2506,6 +2517,75 @@ void VPlanTransforms::createInterleaveGroups( } } +void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx, + VFRange &Range) { + if (Plan.hasScalarVFOnly()) + return; + + SmallVector ToErase; + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( + vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + auto *MemR = dyn_cast(&R); + // TODO: support strided store + // TODO: support strided accesses with stride not equal to -1 + if (!MemR || !isa(MemR) || !MemR->isReverse()) + continue; + + Instruction &Ingredient = MemR->getIngredient(); + Type *ElementTy = getLoadStoreType(&Ingredient); + + auto IsProfitable = [&](ElementCount VF) -> bool { + Type *DataTy = toVectorTy(ElementTy, VF); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment)) + return false; + const InstructionCost CurrentCost = MemR->computeCost(VF, Ctx); + const InstructionCost StridedLoadStoreCost = + Ctx.TTI.getStridedMemoryOpCost( + Ingredient.getOpcode(), DataTy, + getLoadStorePointerOperand(&Ingredient), MemR->isMasked(), + Alignment, Ctx.CostKind, &Ingredient); + return StridedLoadStoreCost < CurrentCost; + }; + + if (!LoopVectorizationPlanner::getDecisionAndClampRange(IsProfitable, 
+ Range)) continue; + + // The stride of a consecutive reverse access must be -1. + int64_t Stride = -1; + auto *VecEndPtr = cast<VPVectorEndPointerRecipe>(MemR->getAddr()); + VPValue *Ptr = VecEndPtr->getPtr(); + auto *GEP = dyn_cast<GetElementPtrInst>( + Ptr->getUnderlyingValue()->stripPointerCasts()); + // Create a new vector pointer for strided access. + auto *NewPtr = new VPVectorPointerRecipe(Ptr, ElementTy, /*Strided=*/true, + GEP ? GEP->getNoWrapFlags() + : GEPNoWrapFlags::none(), + VecEndPtr->getDebugLoc()); + NewPtr->insertBefore(MemR); + + auto *LoadR = cast<VPWidenLoadRecipe>(MemR); + auto *LI = cast<LoadInst>(&Ingredient); + const DataLayout &DL = LI->getDataLayout(); + auto *StrideTy = DL.getIndexType(LI->getPointerOperand()->getType()); + VPValue *StrideVPV = Plan.getOrAddLiveIn( + ConstantInt::get(StrideTy, Stride * DL.getTypeAllocSize(ElementTy))); + auto *StridedLoad = new VPWidenStridedLoadRecipe( + *LI, NewPtr, StrideVPV, &Plan.getVF(), LoadR->getMask(), *LoadR, + LoadR->getDebugLoc()); + StridedLoad->insertBefore(LoadR); + LoadR->replaceAllUsesWith(StridedLoad); + + ToErase.append({LoadR, VecEndPtr}); + } + } + + for (VPRecipeBase *R : ToErase) + R->eraseFromParent(); +} + // Expand VPExtendedReductionRecipe to VPWidenCastRecipe + VPReductionRecipe. static void expandVPExtendedReduction(VPExtendedReductionRecipe *ExtRed) { VPWidenCastRecipe *Ext; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 3a1ed7406b383..bddbc5e87238b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -171,6 +171,12 @@ struct VPlanTransforms { &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed); + /// Transform reverse memory recipes into strided access recipes when legal + /// and profitable. Clamps \p Range to maintain consistency with the widening + /// decisions of \p Plan, and uses \p Ctx to evaluate the cost. + static void convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx, + VFRange &Range); + /// Remove dead recipes from \p Plan.
static void removeDeadRecipes(VPlan &Plan); diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 64065edd315f9..b78a406face90 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -344,6 +344,7 @@ class VPDef { VPPartialReductionSC, VPReplicateSC, VPScalarIVStepsSC, + VPWidenStridedLoadSC, VPVectorPointerSC, VPVectorEndPointerSC, VPWidenCallSC, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 68b35d42e8674..e6294bdf7eb21 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -158,7 +158,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { .Case([&](const VPWidenIntrinsicRecipe *S) { return VerifyEVLUse(*S, S->getNumOperands() - 1); }) - .Case( + .Case( [&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); }) .Case( [&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); }) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll index a28673cf8e552..94ccbd91bdab1 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll @@ -39,23 +39,20 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] ; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 ; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]] -; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP5]] -; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP10]] -; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP11]] -; RV64-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP13]], align 4 -; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) -; RV64-NEXT: [[TMP14:%.*]] = add [[REVERSE]], splat (i32 1) -; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] -; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP17:%.*]] = sub i64 1, [[TMP5]] -; RV64-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP16]] -; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP17]] -; RV64-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP14]]) -; RV64-NEXT: store [[REVERSE1]], ptr [[TMP19]], align 4 +; RV64-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0 +; RV64-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 -4, splat (i1 true), i32 [[TMP11]]) +; RV64-NEXT: [[TMP12:%.*]] = add [[WIDE_STRIDED_LOAD]], splat (i32 1) +; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] +; RV64-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP5]] +; RV64-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP5]] +; RV64-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP14]] +; RV64-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP15]] +; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP12]]) +; RV64-NEXT: store [[REVERSE]], ptr [[TMP17]], align 4 ; RV64-NEXT: [[INDEX_NEXT]] = add nuw 
i64 [[INDEX]], [[TMP5]] -; RV64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; RV64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; RV64: [[MIDDLE_BLOCK]]: ; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] ; RV64-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -66,8 +63,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV64-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; RV64-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 ; RV64-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]] -; RV64-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 -; RV64-NEXT: [[ADD:%.*]] = add i32 [[TMP21]], 1 +; RV64-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 +; RV64-NEXT: [[ADD:%.*]] = add i32 [[TMP19]], 1 ; RV64-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]] ; RV64-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4 ; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 @@ -96,25 +93,21 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] ; RV32-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 ; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]] -; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]] -; RV32-NEXT: [[TMP12:%.*]] = sub i32 1, [[TMP10]] -; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 [[TMP11]] -; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 [[TMP12]] -; RV32-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 -; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) -; RV32-NEXT: [[TMP15:%.*]] = add [[REVERSE]], splat (i32 1) -; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] -; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]] -; RV32-NEXT: [[TMP19:%.*]] = sub i32 1, [[TMP17]] -; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[TMP18]] -; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 [[TMP19]] -; RV32-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP15]]) -; RV32-NEXT: store [[REVERSE1]], ptr [[TMP21]], align 4 +; RV32-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0 +; RV32-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP5]] to i32 +; RV32-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP10]], i32 -4, splat (i1 true), i32 [[TMP11]]) +; RV32-NEXT: [[TMP12:%.*]] = add [[WIDE_STRIDED_LOAD]], splat (i32 1) +; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] +; RV32-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP5]] to i32 +; RV32-NEXT: [[TMP15:%.*]] = mul i32 0, [[TMP14]] +; RV32-NEXT: [[TMP16:%.*]] = sub i32 1, [[TMP14]] +; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 [[TMP15]] +; RV32-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 [[TMP16]] +; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP12]]) +; 
RV32-NEXT: store [[REVERSE]], ptr [[TMP18]], align 4 ; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; RV32-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV32-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; RV32-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV32-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; RV32: [[MIDDLE_BLOCK]]: ; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] ; RV32-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -125,8 +118,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; RV32-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 ; RV32-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]] -; RV32-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 -; RV32-NEXT: [[ADD:%.*]] = add i32 [[TMP23]], 1 +; RV32-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 +; RV32-NEXT: [[ADD:%.*]] = add i32 [[TMP20]], 1 ; RV32-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]] ; RV32-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4 ; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 @@ -154,38 +147,35 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV64-UF2: [[VECTOR_BODY]]: ; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 -; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP9]] -; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP12:%.*]] = sub i64 1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP11]] -; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP12]] -; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP15]] -; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]] -; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 -; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) -; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP18]], align 4 -; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD1]]) -; RV64-UF2-NEXT: [[TMP19:%.*]] = add [[REVERSE]], splat (i32 1) -; RV64-UF2-NEXT: [[TMP20:%.*]] = add [[REVERSE2]], splat (i32 1) -; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]] -; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP22]] -; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]] -; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP26]] -; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP27]] -; RV64-UF2-NEXT: 
[[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP19]]) -; RV64-UF2-NEXT: store [[REVERSE3]], ptr [[TMP25]], align 4 -; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP20]]) -; RV64-UF2-NEXT: store [[REVERSE4]], ptr [[TMP29]], align 4 +; RV64-UF2-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 +; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP8]] +; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0 +; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4 +; RV64-UF2-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], -1 +; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP13]] +; RV64-UF2-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 -4, splat (i1 true), i32 [[TMP15]]) +; RV64-UF2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD1:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP14]], i64 -4, splat (i1 true), i32 [[TMP16]]) +; RV64-UF2-NEXT: [[TMP17:%.*]] = add [[WIDE_STRIDED_LOAD]], splat (i32 1) +; RV64-UF2-NEXT: [[TMP18:%.*]] = add [[WIDE_STRIDED_LOAD1]], splat (i32 1) +; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] +; RV64-UF2-NEXT: [[TMP20:%.*]] = mul i64 0, [[TMP5]] +; RV64-UF2-NEXT: [[TMP21:%.*]] = sub i64 1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP20]] +; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i64 [[TMP21]] +; RV64-UF2-NEXT: [[TMP24:%.*]] = mul i64 -1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP25:%.*]] = sub i64 1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP24]] +; RV64-UF2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i64 [[TMP25]] +; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP17]]) +; RV64-UF2-NEXT: store [[REVERSE]], ptr [[TMP23]], align 4 +; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP18]]) +; RV64-UF2-NEXT: store [[REVERSE2]], ptr [[TMP27]], align 4 ; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-UF2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; RV64-UF2-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-UF2-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; RV64-UF2: [[MIDDLE_BLOCK]]: ; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] ; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -196,8 +186,8 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV64-UF2-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; RV64-UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 ; RV64-UF2-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV_NEXT]] -; RV64-UF2-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 -; RV64-UF2-NEXT: [[ADD:%.*]] = add i32 [[TMP31]], 1 +; RV64-UF2-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 +; RV64-UF2-NEXT: [[ADD:%.*]] = add i32 [[TMP29]], 1 ; RV64-UF2-NEXT: 
[[ARRAYIDX_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV_NEXT]] ; RV64-UF2-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX_A]], align 4 ; RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 @@ -245,23 +235,20 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] ; RV64-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 ; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]] -; RV64-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP5]] -; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP10]] -; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP11]] -; RV64-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP13]], align 4 -; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) -; RV64-NEXT: [[TMP14:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) -; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] -; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP17:%.*]] = sub i64 1, [[TMP5]] -; RV64-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[TMP16]] -; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 [[TMP17]] -; RV64-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP14]]) -; RV64-NEXT: store [[REVERSE1]], ptr [[TMP19]], align 4 +; RV64-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0 +; RV64-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP10]], i64 -4, splat (i1 true), i32 [[TMP11]]) +; RV64-NEXT: [[TMP12:%.*]] = fadd [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00) +; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] +; RV64-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP5]] +; RV64-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP5]] +; RV64-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[TMP14]] +; RV64-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP15]] +; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP12]]) +; RV64-NEXT: store [[REVERSE]], ptr [[TMP17]], align 4 ; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; RV64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; RV64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; RV64: [[MIDDLE_BLOCK]]: ; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] ; RV64-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -272,8 +259,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV64-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; RV64-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 ; RV64-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]] -; RV64-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 -; RV64-NEXT: [[FADD:%.*]] = fadd float [[TMP21]], 1.000000e+00 +; RV64-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 +; RV64-NEXT: [[FADD:%.*]] = fadd float [[TMP19]], 1.000000e+00 ; 
RV64-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]] ; RV64-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4 ; RV64-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 @@ -302,25 +289,21 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] ; RV32-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 ; RV32-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]] -; RV32-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP11:%.*]] = mul i32 0, [[TMP10]] -; RV32-NEXT: [[TMP12:%.*]] = sub i32 1, [[TMP10]] -; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 [[TMP11]] -; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP12]] -; RV32-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 -; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) -; RV32-NEXT: [[TMP15:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) -; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] -; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]] -; RV32-NEXT: [[TMP19:%.*]] = sub i32 1, [[TMP17]] -; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP18]] -; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP19]] -; RV32-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP15]]) -; RV32-NEXT: store [[REVERSE1]], ptr [[TMP21]], align 4 +; RV32-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0 +; RV32-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP5]] to i32 +; RV32-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4f32.p0.i32(ptr align 4 [[TMP10]], i32 -4, splat (i1 true), i32 [[TMP11]]) +; RV32-NEXT: [[TMP12:%.*]] = fadd [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00) +; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] +; RV32-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP5]] to i32 +; RV32-NEXT: [[TMP15:%.*]] = mul i32 0, [[TMP14]] +; RV32-NEXT: [[TMP16:%.*]] = sub i32 1, [[TMP14]] +; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP15]] +; RV32-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP16]] +; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP12]]) +; RV32-NEXT: store [[REVERSE]], ptr [[TMP18]], align 4 ; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; RV32-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV32-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; RV32-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV32-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; RV32: [[MIDDLE_BLOCK]]: ; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] ; RV32-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -331,8 +314,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; RV32-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 ; RV32-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]] -; RV32-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 -; RV32-NEXT: 
[[FADD:%.*]] = fadd float [[TMP23]], 1.000000e+00 +; RV32-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 +; RV32-NEXT: [[FADD:%.*]] = fadd float [[TMP20]], 1.000000e+00 ; RV32-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]] ; RV32-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4 ; RV32-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 @@ -360,38 +343,35 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV64-UF2: [[VECTOR_BODY]]: ; RV64-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; RV64-UF2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; RV64-UF2-NEXT: [[TMP9:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 -; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] -; RV64-UF2-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP12:%.*]] = sub i64 1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP11]] -; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[TMP12]] -; RV64-UF2-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP16:%.*]] = sub i64 1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP15]] -; RV64-UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP16]] -; RV64-UF2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 -; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) -; RV64-UF2-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP18]], align 4 -; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD1]]) -; RV64-UF2-NEXT: [[TMP19:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) -; RV64-UF2-NEXT: [[TMP20:%.*]] = fadd [[REVERSE2]], splat (float 1.000000e+00) -; RV64-UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] -; RV64-UF2-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP5]] -; RV64-UF2-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP22]] -; RV64-UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i64 [[TMP23]] -; RV64-UF2-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP5]] -; RV64-UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP26]] -; RV64-UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP27]] -; RV64-UF2-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP19]]) -; RV64-UF2-NEXT: store [[REVERSE3]], ptr [[TMP25]], align 4 -; RV64-UF2-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP20]]) -; RV64-UF2-NEXT: store [[REVERSE4]], ptr [[TMP29]], align 4 +; RV64-UF2-NEXT: [[TMP8:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 +; RV64-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]] +; RV64-UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0 +; RV64-UF2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; RV64-UF2-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4 +; RV64-UF2-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], -1 +; RV64-UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP13]] +; RV64-UF2-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP10]], i64 -4, splat (i1 
true), i32 [[TMP15]]) +; RV64-UF2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP5]] to i32 +; RV64-UF2-NEXT: [[WIDE_STRIDED_LOAD1:%.*]] = call @llvm.experimental.vp.strided.load.nxv4f32.p0.i64(ptr align 4 [[TMP14]], i64 -4, splat (i1 true), i32 [[TMP16]]) +; RV64-UF2-NEXT: [[TMP17:%.*]] = fadd [[WIDE_STRIDED_LOAD]], splat (float 1.000000e+00) +; RV64-UF2-NEXT: [[TMP18:%.*]] = fadd [[WIDE_STRIDED_LOAD1]], splat (float 1.000000e+00) +; RV64-UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] +; RV64-UF2-NEXT: [[TMP20:%.*]] = mul i64 0, [[TMP5]] +; RV64-UF2-NEXT: [[TMP21:%.*]] = sub i64 1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[TMP20]] +; RV64-UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP21]] +; RV64-UF2-NEXT: [[TMP24:%.*]] = mul i64 -1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP25:%.*]] = sub i64 1, [[TMP5]] +; RV64-UF2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[TMP24]] +; RV64-UF2-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP25]] +; RV64-UF2-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP17]]) +; RV64-UF2-NEXT: store [[REVERSE]], ptr [[TMP23]], align 4 +; RV64-UF2-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP18]]) +; RV64-UF2-NEXT: store [[REVERSE2]], ptr [[TMP27]], align 4 ; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; RV64-UF2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-UF2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; RV64-UF2-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-UF2-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; RV64-UF2: [[MIDDLE_BLOCK]]: ; RV64-UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] ; RV64-UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -402,8 +382,8 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) { ; RV64-UF2-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; RV64-UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1 ; RV64-UF2-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV_NEXT]] -; RV64-UF2-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 -; RV64-UF2-NEXT: [[FADD:%.*]] = fadd float [[TMP31]], 1.000000e+00 +; RV64-UF2-NEXT: [[TMP29:%.*]] = load float, ptr [[ARRAYIDX_B]], align 4 +; RV64-UF2-NEXT: [[FADD:%.*]] = fadd float [[TMP29]], 1.000000e+00 ; RV64-UF2-NEXT: [[ARRAYIDX_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV_NEXT]] ; RV64-UF2-NEXT: store float [[FADD]], ptr [[ARRAYIDX_A]], align 4 ; RV64-UF2-NEXT: [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 9e77a0ca8bcc9..7382fffdbb0ed 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -74,15 +74,15 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, 
ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, vp<[[VF]]> -; CHECK-NEXT: WIDEN ir<%1> = load vp<[[VEC_PTR]]> -; CHECK-NEXT: WIDEN ir<%add9> = add ir<%1>, ir<1> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, vp<[[VF]]> -; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add9> +; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> +; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> +; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1> +; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, vp<[[VF]]> +; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors @@ -154,7 +154,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK: Executing best plan with VF=vscale x 4, UF=1 ; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' { ; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF -; CHECK-NEXT: Live-in ir<[[VFxUF:%.+]]>.1 = VF * UF +; CHECK-NEXT: Live-in ir<[[VF]]>.1 = VF * UF ; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: ir<%0> = original trip-count ; CHECK-EMPTY: @@ -199,16 +199,16 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ] ; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[DEV_IV]]>, ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, ir<[[VF]]> -; CHECK-NEXT: WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]> -; CHECK-NEXT: WIDEN ir<%add9> = add ir<[[L]]>, ir<1> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<[[VF]]> -; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add9> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1 +; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1> +; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]> +; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]> +; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]> +; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD]]>, ir<1> +; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]> +; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]> +; 
CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VF]]>.1
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
@@ -325,15 +325,15 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
 ; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, vp<[[VF]]>
-; CHECK-NEXT: WIDEN ir<%1> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%1>, ir<1.000000e+00>
-; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, vp<[[VF]]>
-; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv1>
+; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[STEPS]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = vp<[[VF]]>
+; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00>
+; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, vp<[[VF]]>
+; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
 ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
@@ -405,7 +405,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK: Executing best plan with VF=vscale x 4, UF=1
 ; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
 ; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF
-; CHECK-NEXT: Live-in ir<[[VFxUF:%.+]]>.1 = VF * UF
+; CHECK-NEXT: Live-in ir<[[VF]]>.1 = VF * UF
 ; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count
 ; CHECK-NEXT: ir<%0> = original trip-count
 ; CHECK-EMPTY:
@@ -450,16 +450,16 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: vector.body:
 ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ]
 ; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, ir<[[VF]]>
-; CHECK-NEXT: WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<[[L]]>, ir<1.000000e+00>
-; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<[[VF]]>
-; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%conv1>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1
+; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[DEV_IV]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<[[ZEXT_IDX:%.+]]> = zext ir<[[IDX]]>
+; CHECK-NEXT: CLONE ir<[[LD_IDX:%.+]]> = getelementptr inbounds ir<%B>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[LD_PTR:%.+]]> = vector-pointer ir<[[LD_IDX]]>
+; CHECK-NEXT: WIDEN ir<[[LD:%.+]]> = load vp<[[LD_PTR]]>, stride = ir<-4>, runtimeVF = ir<[[VF]]>
+; CHECK-NEXT: WIDEN ir<[[ADD:%.+]]> = fadd ir<[[LD]]>, ir<1.000000e+00>
+; CHECK-NEXT: CLONE ir<[[ST_IDX:%.+]]> = getelementptr inbounds ir<%A>, ir<[[ZEXT_IDX]]>
+; CHECK-NEXT: vp<[[ST_PTR:%.+]]> = vector-end-pointer inbounds ir<[[ST_IDX]]>, ir<[[VF]]>
+; CHECK-NEXT: WIDEN store vp<[[ST_PTR]]>, ir<[[ADD]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VF]]>.1
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index 7bdb67b8a0fba..251cf7c01c976 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -29,37 +29,32 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
 ; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL:%.*]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], -1
-; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP7]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP18]]
-; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 1, [[TMP18]]
-; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]]
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
-; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP19]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP19]]
-; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
-; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], -1
+; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP6]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP8]], i64 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP6]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = zext i32 [[TMP5]] to i64
+; IF-EVL-NEXT: [[TMP11:%.*]] = mul i64 0, [[TMP10]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = sub i64 1, [[TMP10]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP9]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE]], ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP5]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
-; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IF-EVL: middle.block:
 ; IF-EVL-NEXT: br label [[LOOPEND:%.*]]
 ; IF-EVL: scalar.ph:
 ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[STARTVAL]], [[ENTRY:%.*]] ]
-; IF-EVL-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ 0, [[ENTRY]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[ENTRY]] ]
 ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
 ; IF-EVL: for.body:
 ; IF-EVL-NEXT: [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; IF-EVL-NEXT: [[ADD]] = add i64 [[ADD_PHI]], -1
 ; IF-EVL-NEXT: [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[ADD]]
 ; IF-EVL-NEXT: [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4
@@ -127,35 +122,29 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
 ; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
 ; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL:%.*]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[OFFSET_IDX3:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32
-; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], -1
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[OFFSET_IDX3]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 100)
-; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP11]]
-; IF-EVL-NEXT: [[TMP26:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP26]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP26]]
-; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]]
-; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]]
-; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
-; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP27]]
-; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP27]]
-; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]]
-; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]]
-; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP25]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP28:%.*]] = zext i32 [[TMP5]] to i64
-; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[OFFSET_IDX1:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32
+; IF-EVL-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], -1
+; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[OFFSET_IDX1]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP9:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 100)
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[PTR1:%.*]], i64 [[TMP6]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP11]], i64 -4, <vscale x 4 x i1> [[TMP9]], i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP6]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP5]] to i64
+; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP13]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP13]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP12]], i64 [[TMP14]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP15]]
+; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_STRIDED_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE]], ptr align 4 [[TMP17]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP5]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP5]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
-; IF-EVL-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; IF-EVL: middle.block:
 ; IF-EVL-NEXT: br label [[LOOPEND:%.*]]
 ; IF-EVL: scalar.ph: