diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index f33906b05fedd..8bd3dbf069573 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -16,6 +16,7 @@ add_llvm_component_library(LLVMVectorize VPlanRecipes.cpp VPlanSLP.cpp VPlanTransforms.cpp + VPlanUnroll.cpp VPlanVerifier.cpp VPlanUtils.cpp diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 2fe9af6b0d14f..034fdf4233de3 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -156,6 +156,15 @@ class VPBuilder { DebugLoc DL, const Twine &Name = "") { return createInstruction(Opcode, Operands, DL, Name); } + VPInstruction *createNaryOp(unsigned Opcode, + std::initializer_list Operands, + std::optional FMFs = {}, + DebugLoc DL = {}, const Twine &Name = "") { + if (FMFs) + return tryInsertInstruction( + new VPInstruction(Opcode, Operands, *FMFs, DL, Name)); + return createInstruction(Opcode, Operands, DL, Name); + } VPInstruction *createOverflowingOp(unsigned Opcode, std::initializer_list Operands, @@ -164,6 +173,7 @@ class VPBuilder { return tryInsertInstruction( new VPInstruction(Opcode, Operands, WrapFlags, DL, Name)); } + VPValue *createNot(VPValue *Operand, DebugLoc DL = {}, const Twine &Name = "") { return createInstruction(VPInstruction::Not, {Operand}, DL, Name); @@ -223,6 +233,11 @@ class VPBuilder { return tryInsertInstruction(new VPScalarCastRecipe(Opcode, Op, ResultTy)); } + VPWidenCastRecipe *createWidenCast(Instruction::CastOps Opcode, VPValue *Op, + Type *ResultTy) { + return tryInsertInstruction(new VPWidenCastRecipe(Opcode, Op, ResultTy)); + } + VPScalarIVStepsRecipe * createScalarIVSteps(Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, VPValue *IV, VPValue *Step) { diff --git 
a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 3cf41628ba0c7..b5c8763b4f66a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7507,6 +7507,10 @@ LoopVectorizationPlanner::executePlan( "expanded SCEVs to reuse can only be used during epilogue vectorization"); (void)IsEpilogueVectorization; + // TODO: Move to VPlan transform stage once the transition to the VPlan-based + // cost model is complete for better cost estimates. + VPlanTransforms::unrollByUF(BestVPlan, BestUF, + OrigLoop->getHeader()->getModule()->getContext()); VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF @@ -7625,7 +7629,7 @@ LoopVectorizationPlanner::executePlan( if (MiddleTerm->isConditional() && hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { // Assume that `Count % VectorTripCount` is equally distributed. - unsigned TripCount = State.UF * State.VF.getKnownMinValue(); + unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); assert(TripCount > 0 && "trip count should not be zero"); const uint32_t Weights[] = {1, TripCount - 1}; setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index eb6f7e95fc223..b0a637012037d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -391,6 +391,7 @@ void VPTransformState::setDebugLocFrom(DebugLoc DL) { ->shouldEmitDebugInfoForProfiling() && !EnableFSDiscriminator) { // FIXME: For scalable vectors, assume vscale=1. 
+ unsigned UF = Plan->getUF(); auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); if (NewDIL) @@ -1018,6 +1019,10 @@ static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) { /// Assumes a single pre-header basic-block was created for this. Introduce /// additional basic-blocks as needed, and fill them all. void VPlan::execute(VPTransformState *State) { + // Set UF to 1, as the unrollByUF VPlan transform already explicitly unrolled + // the VPlan. + // TODO: Remove State::UF and all uses. + State->UF = 1; // Initialize CFG state. State->CFG.PrevVPBB = nullptr; State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor(); @@ -1093,6 +1098,10 @@ void VPlan::execute(VPTransformState *State) { // consistent placement of all induction updates. Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1)); Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode()); + + // Use the steps for the last part as backedge value for the induction. + if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) + Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand(), 0)); continue; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 116d731d1f02a..a29d972ca1d16 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -532,6 +532,7 @@ class VPBlockBase { VPBlocksTy &getSuccessors() { return Successors; } iterator_range<VPBlockBase **> successors() { return Successors; } + iterator_range<VPBlockBase **> predecessors() { return Predecessors; } const VPBlocksTy &getPredecessors() const { return Predecessors; } VPBlocksTy &getPredecessors() { return Predecessors; } @@ -724,6 +725,11 @@ class VPLiveOut : public VPUser { PHINode *getPhi() const { return Phi; } + /// Live-outs are marked as only using the first part during the transition + /// to unrolling directly on VPlan. + /// TODO: Remove after unroller transition.
+ bool onlyFirstPartUsed(const VPValue *Op) const override { return true; } + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the VPLiveOut to \p O. void print(raw_ostream &O, VPSlotTracker &SlotTracker) const; @@ -1226,11 +1232,24 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe { #endif }; +/// Helper to access the operand that contains the unroll part for this recipe +/// after unrolling. +template <unsigned PartOpIdx> class VPUnrollPartAccessor { +protected: + /// Return the VPValue operand containing the unroll part or null if there is + /// no such operand. + VPValue *getUnrollPartOperand(VPUser &U) const; + + /// Return the unroll part. + unsigned getUnrollPart(VPUser &U) const; +}; + /// This is a concrete Recipe that models a single VPlan-level instruction. /// While as any Recipe it may generate a sequence of IR instructions when /// executed, these instructions would always form a single-def expression as /// the VPInstruction is also a single def-use vertex. -class VPInstruction : public VPRecipeWithIRFlags { +class VPInstruction : public VPRecipeWithIRFlags, + public VPUnrollPartAccessor<1> { friend class VPlanSlp; public: @@ -1764,7 +1783,8 @@ class VPWidenGEPRecipe : public VPRecipeWithIRFlags { /// A recipe to compute the pointers for widened memory accesses of IndexTy for /// all parts. If IsReverse is true, compute pointers for accessing the input in /// reverse order per part.
-class VPVectorPointerRecipe : public VPRecipeWithIRFlags { +class VPVectorPointerRecipe : public VPRecipeWithIRFlags, + public VPUnrollPartAccessor<1> { Type *IndexedTy; bool IsReverse; @@ -1789,7 +1809,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags { bool onlyFirstPartUsed(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); - assert(getNumOperands() == 1 && "must have a single operand"); + assert(getNumOperands() <= 2 && "must have at most two operands"); return true; } @@ -1948,6 +1968,12 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe { VPValue *getVFValue() { return getOperand(2); } const VPValue *getVFValue() const { return getOperand(2); } + VPValue *getSplatVFValue() { + // If the recipe has been unrolled (4 operands), return the VPValue for the + // induction increment. + return getNumOperands() == 5 ? getOperand(3) : nullptr; + } + /// Returns the first defined value as TruncInst, if it is one or nullptr /// otherwise. TruncInst *getTruncInst() { return Trunc; } @@ -1967,9 +1993,17 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe { Type *getScalarType() const { return Trunc ? Trunc->getType() : IV->getType(); } + + /// Returns the VPValue representing the value of this induction at + /// the last unrolled part, if it exists. Returns itself if unrolling did not + /// take place. + VPValue *getLastUnrolledPartOperand() { + return getNumOperands() == 5 ? getOperand(4) : this; + } }; -class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe { +class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe, + public VPUnrollPartAccessor<3> { const InductionDescriptor &IndDesc; bool IsScalarAfterVectorization; @@ -2006,6 +2040,13 @@ class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe { /// Returns the induction descriptor for the recipe. 
const InductionDescriptor &getInductionDescriptor() const { return IndDesc; } + /// Returns the VPValue representing the value of this induction at + /// the first unrolled part, if it exists. Returns itself if unrolling did not + /// take place. + VPValue *getFirstUnrolledPartOperand() { + return getUnrollPart(*this) == 0 ? this : getOperand(2); + } + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, @@ -2088,7 +2129,8 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe { /// A recipe for handling reduction phis. The start value is the first operand /// of the recipe and the incoming value from the backedge is the second /// operand. -class VPReductionPHIRecipe : public VPHeaderPHIRecipe { +class VPReductionPHIRecipe : public VPHeaderPHIRecipe, + public VPUnrollPartAccessor<2> { /// Descriptor for the reduction. const RecurrenceDescriptor &RdxDesc; @@ -2907,7 +2949,10 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe { ~VPActiveLaneMaskPHIRecipe() override = default; VPActiveLaneMaskPHIRecipe *clone() override { - return new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc()); + auto *R = new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc()); + if (getNumOperands() == 2) + R->addOperand(getOperand(1)); + return R; } VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC) @@ -2966,7 +3011,8 @@ class VPEVLBasedIVPHIRecipe : public VPHeaderPHIRecipe { }; /// A Recipe for widening the canonical induction variable of the vector loop. 
-class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe { +class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe, + public VPUnrollPartAccessor<1> { public: VPWidenCanonicalIVRecipe(VPCanonicalIVPHIRecipe *CanonicalIV) : VPSingleDefRecipe(VPDef::VPWidenCanonicalIVSC, {CanonicalIV}) {} @@ -3052,7 +3098,8 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe { /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their scalar values. -class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags { +class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags, + public VPUnrollPartAccessor<2> { Instruction::BinaryOps InductionOpcode; public: @@ -3548,6 +3595,11 @@ class VPlan { bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); } + unsigned getUF() const { + assert(UFs.size() == 1 && "Expected a single UF"); + return UFs[0]; + } + void setUF(unsigned UF) { assert(hasUF(UF) && "Cannot set the UF not already in plan"); UFs.clear(); @@ -3732,6 +3784,22 @@ class VPBlockUtils { connectBlocks(BlockPtr, NewBlock); } + /// Insert disconnected block \p NewBlock before \p Blockptr. First + /// disconnects all predecessors of \p BlockPtr and connects them to \p + /// NewBlock. Add \p NewBlock as predecessor of \p BlockPtr and \p BlockPtr as + /// successor of \p NewBlock. + static void insertBlockBefore(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) { + assert(NewBlock->getSuccessors().empty() && + NewBlock->getPredecessors().empty() && + "Can't insert new block with predecessors or successors."); + NewBlock->setParent(BlockPtr->getParent()); + for (VPBlockBase *Pred : to_vector(BlockPtr->predecessors())) { + disconnectBlocks(Pred, BlockPtr); + connectBlocks(Pred, NewBlock); + } + connectBlocks(NewBlock, BlockPtr); + } + /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p /// BlockPtr. Add \p IfTrue and \p IfFalse as succesors of \p BlockPtr and \p /// BlockPtr as predecessor of \p IfTrue and \p IfFalse. 
Propagate \p BlockPtr diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 5f86f2c969651..4ddbd0d5fafb8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -144,6 +144,10 @@ struct UnaryRecipe_match { return DefR && match(DefR); } + bool match(const VPSingleDefRecipe *R) { + return match(static_cast<const VPRecipeBase *>(R)); + } + bool match(const VPRecipeBase *R) { if (!detail::MatchRecipeAndOpcode<Opcode, RecipeTys...>::match(R)) return false; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 9a2cfbc35cb84..255b8b03ae0db 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -332,6 +332,21 @@ FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const { return Res; } +template <unsigned PartOpIdx> +VPValue * +VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const { + if (U.getNumOperands() == PartOpIdx + 1) + return U.getOperand(PartOpIdx); + return nullptr; +} + +template <unsigned PartOpIdx> +unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const { + if (auto *UnrollPartOp = getUnrollPartOperand(U)) + return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue(); + return 0; +} + VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL, const Twine &Name) @@ -458,9 +473,9 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { if (Part != 0) return State.get(this, 0, /*IsScalar*/ true); + unsigned UF = getParent()->getPlan()->getUF(); Value *ScalarTC = State.get(getOperand(0), {0, 0}); - Value *Step = - createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF); + Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF); Value *Sub = Builder.CreateSub(ScalarTC, Step); Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step); Value *Zero =
ConstantInt::get(ScalarTC->getType(), 0); @@ -492,12 +507,11 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { return EVL; } case VPInstruction::CanonicalIVIncrementForPart: { + unsigned Part = getUnrollPart(*this); auto *IV = State.get(getOperand(0), VPIteration(0, 0)); - if (Part == 0) - return IV; - - // The canonical IV is incremented by the vectorization factor (num of SIMD - // elements) times the unroll part. + assert(Part != 0 && "Must have a positive part"); + // The canonical IV is incremented by the vectorization factor (num of + // SIMD elements) times the unroll part. Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(), hasNoSignedWrap()); @@ -548,9 +562,6 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { return CondBr; } case VPInstruction::ComputeReductionResult: { - if (Part != 0) - return State.get(this, 0, /*IsScalar*/ true); - // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary // and will be removed by breaking up the recipe further. auto *PhiR = cast(getOperand(0)); @@ -560,11 +571,13 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { RecurKind RK = RdxDesc.getRecurrenceKind(); - VPValue *LoopExitingDef = getOperand(1); Type *PhiTy = OrigPhi->getType(); - VectorParts RdxParts(State.UF); - for (unsigned Part = 0; Part < State.UF; ++Part) - RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop()); + // The recipe's operands are the reduction phi, followed by one operand for + // each part of the reduction. 
+ unsigned UF = getNumOperands() - 1; + VectorParts RdxParts(UF); + for (unsigned Part = 0; Part < UF; ++Part) + RdxParts[Part] = State.get(getOperand(1 + Part), 0, PhiR->isInLoop()); // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the @@ -572,7 +585,7 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { // TODO: Handle this in truncateToMinBW. if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF); - for (unsigned Part = 0; Part < State.UF; ++Part) + for (unsigned Part = 0; Part < UF; ++Part) RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); } // Reduce all of the unrolled parts into a single vector. @@ -582,12 +595,12 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { Op = Instruction::Or; if (PhiR->isOrdered()) { - ReducedPartRdx = RdxParts[State.UF - 1]; + ReducedPartRdx = RdxParts[UF - 1]; } else { // Floating-point operations should have some FMF to enable the reduction. IRBuilderBase::FastMathFlagGuard FMFG(Builder); Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); - for (unsigned Part = 1; Part < State.UF; ++Part) { + for (unsigned Part = 1; Part < UF; ++Part) { Value *RdxPart = RdxParts[Part]; if (Op != Instruction::ICmp && Op != Instruction::FCmp) ReducedPartRdx = Builder.CreateBinOp( @@ -1507,24 +1520,32 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { MulOp = Instruction::FMul; } - // Multiply the vectorization factor by the step using integer or - // floating-point arithmetic as appropriate. 
- Type *StepType = Step->getType(); - Value *RuntimeVF = State.get(getVFValue(), {0, 0}); - if (Step->getType()->isFloatingPointTy()) - RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType); - else - RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType); - Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); - - // Create a vector splat to use in the induction update. - // - // FIXME: If the step is non-constant, we create the vector splat with - // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't - // handle a constant vector splat. - Value *SplatVF = isa(Mul) - ? ConstantVector::getSplat(State.VF, cast(Mul)) - : Builder.CreateVectorSplat(State.VF, Mul); + Value *SplatVF; + if (VPValue *SplatVFOperand = getSplatVFValue()) { + // The recipe has been unrolled. In that case, fetch the splat value for the + // induction increment. + SplatVF = State.get(SplatVFOperand, 0); + } else { + // Multiply the vectorization factor by the step using integer or + // floating-point arithmetic as appropriate. + Type *StepType = Step->getType(); + Value *RuntimeVF = State.get(getVFValue(), {0, 0}); + if (Step->getType()->isFloatingPointTy()) + RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType); + else + RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType); + Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); + + // Create a vector splat to use in the induction update. + // + // FIXME: If the step is non-constant, we create the vector splat with + // IRBuilder. IRBuilder can constant-fold the multiply, but it + // doesn't handle a constant vector splat. + SplatVF = isa(Mul) + ? 
ConstantVector::getSplat(State.VF, cast(Mul)) + : Builder.CreateVectorSplat(State.VF, Mul); + } + Builder.restoreIP(CurrIP); // We may need to add the step a number of times, depending on the unroll @@ -1647,7 +1668,7 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) { } unsigned StartPart = 0; - unsigned EndPart = State.UF; + unsigned EndPart = 1; unsigned StartLane = 0; unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); if (State.Instance) { @@ -1656,8 +1677,10 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) { StartLane = State.Instance->Lane.getKnownLane(); EndLane = StartLane + 1; } + // TODO: Remove loop after VPlan-based unroller lands. for (unsigned Part = StartPart; Part < EndPart; ++Part) { - Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); + Value *StartIdx0 = + createStepForVF(Builder, IntStepTy, State.VF, getUnrollPart(*this)); if (!FirstLaneOnly && State.VF.isScalable()) { auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); @@ -1791,6 +1814,7 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, void VPVectorPointerRecipe ::execute(VPTransformState &State) { auto &Builder = State.Builder; State.setDebugLocFrom(getDebugLoc()); + unsigned CurrentPart = getUnrollPart(*this); for (unsigned Part = 0; Part < State.UF; ++Part) { // Calculate the pointer for the specific unroll-part. Value *PartPtr = nullptr; @@ -1798,7 +1822,7 @@ void VPVectorPointerRecipe ::execute(VPTransformState &State) { // or query DataLayout for a more suitable index type otherwise. const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout(); - Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0) + Type *IndexTy = State.VF.isScalable() && (IsReverse || CurrentPart > 0) ? 
DL.getIndexType(IndexedTy->getPointerTo()) : Builder.getInt32Ty(); Value *Ptr = State.get(getOperand(0), VPIteration(0, 0)); @@ -1809,16 +1833,17 @@ void VPVectorPointerRecipe ::execute(VPTransformState &State) { // RunTimeVF = VScale * VF.getKnownMinValue() // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF); - // NumElt = -Part * RunTimeVF + // NumElt = -CurrentPart * RunTimeVF Value *NumElt = Builder.CreateMul( - ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF); + ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF); // LastLane = 1 - RunTimeVF Value *LastLane = Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds); PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds); } else { - Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); + Value *Increment = + createStepForVF(Builder, IndexTy, State.VF, CurrentPart); PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds); } @@ -2894,42 +2919,58 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { auto *IVR = getParent()->getPlan()->getCanonicalIV(); PHINode *CanonicalIV = cast(State.get(IVR, 0, /*IsScalar*/ true)); - Type *PhiType = IndDesc.getStep()->getType(); + unsigned CurrentPart = getUnrollPart(*this); // Build a pointer phi Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); Type *ScStValueType = ScalarStartValue->getType(); - PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi", - CanonicalIV->getIterator()); BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); - NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); + PHINode *NewPointerPhi = nullptr; + if (CurrentPart == 0) { + NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi", + CanonicalIV->getIterator()); + NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); + } else { + 
// The recipe has been unrolled. In that case, fetch the single pointer phi + // shared among all unrolled parts of the recipe. + auto *GEP = + cast(State.get(getFirstUnrolledPartOperand(), 0)); + NewPointerPhi = cast(GEP->getPointerOperand()); + } // A pointer induction, performed by using a gep BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint(); - Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0)); + Type *PhiType = IndDesc.getStep()->getType(); Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); - Value *NumUnrolledElems = - State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); - Value *InductionGEP = GetElementPtrInst::Create( - State.Builder.getInt8Ty(), NewPointerPhi, - State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", - InductionLoc); // Add induction update using an incorrect block temporarily. The phi node // will be fixed after VPlan execution. Note that at this point the latch // block cannot be used, as it does not exist yet. // TODO: Model increment value in VPlan, by turning the recipe into a // multi-def and a subclass of VPHeaderPHIRecipe. - NewPointerPhi->addIncoming(InductionGEP, VectorPH); + if (CurrentPart == 0) { + // The recipe represents the first part of the pointer induction. Create the + // GEP to increment the phi across all unrolled parts. + unsigned UF = CurrentPart == 0 ? getParent()->getPlan()->getUF() : 1; + Value *NumUnrolledElems = + State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF)); + + Value *InductionGEP = GetElementPtrInst::Create( + State.Builder.getInt8Ty(), NewPointerPhi, + State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", + InductionLoc); + + NewPointerPhi->addIncoming(InductionGEP, VectorPH); + } // Create UF many actual address geps that use the pointer // phi as base and a vectorized version of the step value // () as offset. 
for (unsigned Part = 0; Part < State.UF; ++Part) { Type *VecPhiType = VectorType::get(PhiType, State.VF); - Value *StartOffsetScalar = - State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); + Value *StartOffsetScalar = State.Builder.CreateMul( + RuntimeVF, ConstantInt::get(PhiType, CurrentPart)); Value *StartOffset = State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); // Create a vector of consecutive numbers from zero to VF. @@ -2950,11 +2991,19 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { + assert((getNumOperands() == 2 || getNumOperands() == 4) && + "unexpected number of operands"); O << Indent << "EMIT "; printAsOperand(O, SlotTracker); O << " = WIDEN-POINTER-INDUCTION "; getStartValue()->printAsOperand(O, SlotTracker); O << ", " << *IndDesc.getStep(); + if (getNumOperands() == 4) { + O << ", "; + getOperand(2)->printAsOperand(O, SlotTracker); + O << ", "; + getOperand(3)->printAsOperand(O, SlotTracker); + } } #endif @@ -2990,7 +3039,7 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { ? CanonicalIV : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { - Value *VStep = createStepForVF(Builder, STy, VF, Part); + Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this)); if (VF.isVector()) { VStep = Builder.CreateVectorSplat(VF, VStep); VStep = @@ -3079,6 +3128,8 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { Value *Iden = nullptr; RecurKind RK = RdxDesc.getRecurrenceKind(); + unsigned CurrentPart = getUnrollPart(*this); + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { // MinMax and AnyOf reductions have the start value as their identity. 
@@ -3087,19 +3138,25 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { } else { IRBuilderBase::InsertPointGuard IPBuilder(Builder); Builder.SetInsertPoint(VectorPH->getTerminator()); - StartV = Iden = - Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident"); + StartV = Iden = State.get(StartVPV, 0); } } else { Iden = llvm::getRecurrenceIdentity(RK, VecTy->getScalarType(), RdxDesc.getFastMathFlags()); if (!ScalarPHI) { - Iden = Builder.CreateVectorSplat(State.VF, Iden); - IRBuilderBase::InsertPointGuard IPBuilder(Builder); - Builder.SetInsertPoint(VectorPH->getTerminator()); - Constant *Zero = Builder.getInt32(0); - StartV = Builder.CreateInsertElement(Iden, StartV, Zero); + if (CurrentPart == 0) { + // Create start and identity vector values for the reduction in the + // preheader. + // TODO: Introduce recipes in VPlan preheader to create initial values. + Iden = Builder.CreateVectorSplat(State.VF, Iden); + IRBuilderBase::InsertPointGuard IPBuilder(Builder); + Builder.SetInsertPoint(VectorPH->getTerminator()); + Constant *Zero = Builder.getInt32(0); + StartV = Builder.CreateInsertElement(Iden, StartV, Zero); + } else { + Iden = Builder.CreateVectorSplat(State.VF, Iden); + } } } @@ -3107,7 +3164,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { Value *EntryPart = State.get(this, Part, IsInLoop); // Make sure to add the reduction start value only to the // first unroll part. - Value *StartVal = (Part == 0) ? StartV : Iden; + Value *StartVal = (CurrentPart == 0) ? 
StartV : Iden; cast(EntryPart)->addIncoming(StartVal, VectorPH); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index fd6090affbbf1..8c76bd5c90a7b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -21,6 +21,7 @@ #include "VPlanUtils.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Analysis/IVDescriptors.h" diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 9d852a27a8ef6..761bce7c6bedf 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -49,6 +49,9 @@ struct VPlanTransforms { /// Clear NSW/NUW flags from reduction instructions if necessary. static void clearReductionWrapFlags(VPlan &Plan); + /// Explicitly unroll \p Plan by \p UF. + static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx); + /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the /// resulting plan to \p BestVF and \p BestUF. static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp new file mode 100644 index 0000000000000..88b1748aeaf52 --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -0,0 +1,475 @@ +//===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements explicit unrolling for VPlans. +/// +//===----------------------------------------------------------------------===// + +#include "VPRecipeBuilder.h" +#include "VPlan.h" +#include "VPlanAnalysis.h" +#include "VPlanCFG.h" +#include "VPlanDominatorTree.h" +#include "VPlanPatternMatch.h" +#include "VPlanTransforms.h" +#include "VPlanUtils.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/TypeSwitch.h" +#include "llvm/Analysis/IVDescriptors.h" +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/PatternMatch.h" + +using namespace llvm; + +namespace { + +/// Helper to hold state needed for unrolling. It holds the Plan to unroll by +/// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate +/// the unrolling transformation, where the original VPValues are retained for +/// part zero. +class UnrollState { + /// Plan to unroll. + VPlan &Plan; + /// Unroll factor to unroll by. + const unsigned UF; + /// Analysis for types. + VPTypeAnalysis TypeInfo; + + /// Unrolling may create recipes that should not be unrolled themselves. + /// Those are tracked in ToSkip. + SmallPtrSet ToSkip; + + // Associate with each VPValue of part 0 its unrolled instances of parts 1, + // ..., UF-1. + DenseMap> VPV2Parts; + + /// Unroll replicate region \p VPR by cloning the region UF - 1 times. + void unrollReplicateRegionByUF(VPRegionBlock *VPR); + + /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform across + /// all parts. + void unrollRecipeByUF(VPRecipeBase &R); + + /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled + /// depends on the concrete header phi. 
Inserts newly created recipes at \p + /// InsertPtForPhi. + void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R, + VPBasicBlock::iterator InsertPtForPhi); + + /// Unroll a widen induction recipe \p IV. This introduces recipes to compute + /// the induction steps for each part. + void unrollWidenInductionByUF(VPWidenIntOrFpInductionRecipe *IV, + VPBasicBlock::iterator InsertPtForPhi); + + VPValue *getConstantVPV(unsigned Part) { + Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType(); + return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part)); + } + +public: + UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx) + : Plan(Plan), UF(UF), TypeInfo(Plan.getCanonicalIV()->getScalarType()) {} + + void unrollBlock(VPBlockBase *VPB); + + VPValue *getValueForPart(VPValue *V, unsigned Part) { + if (Part == 0 || V->isLiveIn()) + return V; + assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) && + "accessed value does not exist"); + return VPV2Parts[V][Part - 1]; + } + + /// Given a single original recipe \p OrigR (of part zero), and its copy \p + /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its + /// corresponding VPValue defined by \p CopyR. + void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR, + unsigned Part) { + for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) { + auto Ins = VPV2Parts.insert({VPV, {}}); + assert(Ins.first->second.size() == Part - 1 && "earlier parts not set"); + Ins.first->second.push_back(CopyR->getVPValue(Idx)); + } + } + + /// Given a uniform recipe \p R, add it for all parts. + void addUniformForAllParts(VPSingleDefRecipe *R) { + auto Ins = VPV2Parts.insert({R, {}}); + assert(Ins.second && "uniform value already added"); + for (unsigned Part = 0; Part != UF; ++Part) + Ins.first->second.push_back(R); + } + + bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); } + + /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part + /// \p P. 
+ void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) { + auto *Op = R->getOperand(OpIdx); + R->setOperand(OpIdx, getValueForPart(Op, Part)); + } + + /// Update \p R's operands with their corresponding VPValues for part \p P. + void remapOperands(VPRecipeBase *R, unsigned Part) { + for (const auto &[OpIdx, Op] : enumerate(R->operands())) + R->setOperand(OpIdx, getValueForPart(Op, Part)); + } +}; +} // namespace + +void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) { + VPBlockBase *InsertPt = VPR->getSingleSuccessor(); + for (unsigned Part = 1; Part != UF; ++Part) { + auto *Copy = VPR->clone(); + VPBlockUtils::insertBlockBefore(Copy, InsertPt); + + auto PartI = vp_depth_first_shallow(Copy->getEntry()); + auto Part0 = vp_depth_first_shallow(VPR->getEntry()); + for (const auto &[PartIVPBB, Part0VPBB] : + zip(VPBlockUtils::blocksOnly(PartI), + VPBlockUtils::blocksOnly(Part0))) { + for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) { + remapOperands(&PartIR, Part); + if (auto *ScalarIVSteps = dyn_cast(&PartIR)) { + ScalarIVSteps->addOperand(getConstantVPV(Part)); + } + + addRecipeForPart(&Part0R, &PartIR, Part); + } + } + } +} + +void UnrollState::unrollWidenInductionByUF( + VPWidenIntOrFpInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) { + VPBasicBlock *PH = cast( + IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor()); + Type *IVTy = TypeInfo.inferScalarType(IV); + auto &ID = IV->getInductionDescriptor(); + std::optional FMFs; + if (isa_and_present(ID.getInductionBinOp())) + FMFs = ID.getInductionBinOp()->getFastMathFlags(); + + VPValue *VectorStep = &Plan.getVF(); + VPBuilder Builder(PH); + if (TypeInfo.inferScalarType(VectorStep) != IVTy) { + Instruction::CastOps CastOp = + IVTy->isFloatingPointTy() ? 
Instruction::UIToFP : Instruction::Trunc; + VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy); + ToSkip.insert(VectorStep->getDefiningRecipe()); + } + + VPValue *ScalarStep = IV->getStepValue(); + auto *ConstStep = ScalarStep->isLiveIn() + ? dyn_cast(ScalarStep->getLiveInIRValue()) + : nullptr; + if (!ConstStep || ConstStep->getZExtValue() != 1) { + if (TypeInfo.inferScalarType(ScalarStep) != IVTy) { + ScalarStep = + Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy); + ToSkip.insert(ScalarStep->getDefiningRecipe()); + } + + unsigned MulOpc = + IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul; + VPInstruction *Mul = Builder.createNaryOp(MulOpc, {VectorStep, ScalarStep}, + FMFs, IV->getDebugLoc()); + VectorStep = Mul; + ToSkip.insert(Mul); + } + + // Now create recipes to compute the induction steps for part 1 .. UF. Part 0 + // remains the header phi. Parts > 0 are computed by adding Step to the + // previous part. The header phi recipe will get 2 new operands: the step + // value for a single part and the last part, used to compute the backedge + // value during VPWidenIntOrFpInductionRecipe::execute. %Part.0 = + // VPWidenIntOrFpInductionRecipe %Start, %ScalarStep, %VectorStep, %Part.3 + // %Part.1 = %Part.0 + %VectorStep + // %Part.2 = %Part.1 + %VectorStep + // %Part.3 = %Part.2 + %VectorStep + // + // The newly added recipes are added to ToSkip to avoid interleaving them + // again. + VPValue *Prev = IV; + Builder.setInsertPoint(IV->getParent(), InsertPtForPhi); + unsigned AddOpc = + IVTy->isFloatingPointTy() ? ID.getInductionOpcode() : Instruction::Add; + for (unsigned Part = 1; Part != UF; ++Part) { + std::string Name = + Part > 1 ? "step.add." 
+ std::to_string(Part) : "step.add"; + + VPInstruction *Add = Builder.createNaryOp(AddOpc, + { + Prev, + VectorStep, + }, + FMFs, IV->getDebugLoc(), Name); + ToSkip.insert(Add); + addRecipeForPart(IV, Add, Part); + Prev = Add; + } + IV->addOperand(VectorStep); + IV->addOperand(Prev); +} + +void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R, + VPBasicBlock::iterator InsertPtForPhi) { + // First-order recurrences pass a single vector or scalar through their header + // phis, irrespective of interleaving. + if (isa(R)) + return; + + // Generate step vectors for each unrolled part. + if (auto *IV = dyn_cast(R)) { + unrollWidenInductionByUF(IV, InsertPtForPhi); + return; + } + + auto *RdxPhi = dyn_cast(R); + if (RdxPhi && RdxPhi->isOrdered()) + return; + + auto InsertPt = std::next(R->getIterator()); + for (unsigned Part = 1; Part != UF; ++Part) { + VPRecipeBase *Copy = R->clone(); + Copy->insertBefore(*R->getParent(), InsertPt); + addRecipeForPart(R, Copy, Part); + if (isa(R)) { + Copy->addOperand(R); + Copy->addOperand(getConstantVPV(Part)); + } else if (RdxPhi) { + Copy->addOperand(getConstantVPV(Part)); + } else { + assert(isa(R) && + "unexpected header phi recipe not needing unrolled part"); + } + } +} + +/// Handle non-header-phi recipes. +void UnrollState::unrollRecipeByUF(VPRecipeBase &R) { + using namespace llvm::VPlanPatternMatch; + if (match(&R, m_BranchOnCond(m_VPValue())) || + match(&R, m_BranchOnCount(m_VPValue(), m_VPValue()))) + return; + + if (auto *VPI = dyn_cast(&R)) { + VPValue *Op0, *Op1; + if (match(VPI, m_VPInstruction( + m_VPValue(Op0), m_VPValue(Op1)))) { + VPI->setOperand(1, getValueForPart(Op1, UF - 1)); + addUniformForAllParts(VPI); + if (Plan.hasScalarVFOnly()) { + // Extracting from end with VF = 1 implies retrieving the scalar part UF + // - Op1. + unsigned Offset = + cast(Op1->getLiveInIRValue())->getZExtValue(); + VPI->replaceAllUsesWith(getValueForPart(Op0, UF - Offset)); + } else { + // Otherwise we extract from the last part. 
+ remapOperands(VPI, UF - 1); + } + return; + } + + if (vputils::onlyFirstPartUsed(VPI)) { + addUniformForAllParts(VPI); + return; + } + } + if (auto *RepR = dyn_cast(&R)) { + if (isa(RepR->getUnderlyingValue()) && + RepR->getOperand(1)->isDefinedOutsideLoopRegions()) { + // Stores to an invariant address only need to store the last part. + remapOperands(&R, UF - 1); + return; + } + if (auto *II = dyn_cast(RepR->getUnderlyingValue())) { + if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) { + addUniformForAllParts(RepR); + return; + } + } + } + + // Unroll non-uniform recipes. + auto InsertPt = std::next(R.getIterator()); + VPBasicBlock &VPBB = *R.getParent(); + for (unsigned Part = 1; Part != UF; ++Part) { + VPRecipeBase *Copy = R.clone(); + Copy->insertBefore(VPBB, InsertPt); + addRecipeForPart(&R, Copy, Part); + + VPValue *Op; + if (match(&R, m_VPInstruction( + m_VPValue(), m_VPValue(Op)))) { + Copy->setOperand(0, getValueForPart(Op, Part - 1)); + Copy->setOperand(1, getValueForPart(Op, Part)); + continue; + } + if (auto *Red = dyn_cast(&R)) { + auto *Phi = cast(R.getOperand(0)); + if (Phi->isOrdered()) { + auto Ins = VPV2Parts.insert({Phi, {}}); + if (Part == 1) { + Ins.first->second.clear(); + Ins.first->second.push_back(Red); + } + Ins.first->second.push_back(Copy->getVPSingleValue()); + Phi->setOperand(1, Copy->getVPSingleValue()); + } + } + remapOperands(Copy, Part); + + // Add operand indicating the part to generate code for, to recipes still + // requiring it. + if (isa(Copy) || + match(Copy, m_VPInstruction( + m_VPValue()))) + Copy->addOperand(getConstantVPV(Part)); + + if (isa(R)) + Copy->setOperand(0, R.getOperand(0)); + } +} + +using namespace llvm::VPlanPatternMatch; +void UnrollState::unrollBlock(VPBlockBase *VPB) { + auto *VPR = dyn_cast(VPB); + if (VPR) { + if (VPR->isReplicator()) + return unrollReplicateRegionByUF(VPR); + + // Traverse blocks in region in RPO to ensure defs are visited before uses + // across blocks. 
+ ReversePostOrderTraversal> + RPOT(VPR->getEntry()); + for (VPBlockBase *VPB : RPOT) + unrollBlock(VPB); + return; + } + + // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes. + auto *VPBB = cast(VPB); + auto InsertPtForPhi = VPBB->getFirstNonPhi(); + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + if (ToSkip.contains(&R) || isa(&R)) + continue; + + // Add all VPValues for all parts to ComputeReductionResult which combines + // the parts to compute the final reduction value. + VPValue *Op1; + if (match(&R, m_VPInstruction( + m_VPValue(), m_VPValue(Op1)))) { + addUniformForAllParts(cast(&R)); + for (unsigned Part = 1; Part != UF; ++Part) + R.addOperand(getValueForPart(Op1, Part)); + continue; + } + VPValue *Op0; + if (match(&R, m_VPInstruction( + m_VPValue(Op0), m_VPValue(Op1)))) { + addUniformForAllParts(cast(&R)); + if (Plan.hasScalarVFOnly()) { + // Extracting from end with VF = 1 implies retrieving the scalar part UF + // - Op1. + unsigned Offset = + cast(Op1->getLiveInIRValue())->getZExtValue(); + R.getVPSingleValue()->replaceAllUsesWith( + getValueForPart(Op0, UF - Offset)); + R.eraseFromParent(); + } else { + // Otherwise we extract from the last part. + remapOperands(&R, UF - 1); + } + continue; + } + + auto *SingleDef = dyn_cast(&R); + if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) { + addUniformForAllParts(SingleDef); + continue; + } + + if (auto *H = dyn_cast(&R)) { + unrollHeaderPHIByUF(H, InsertPtForPhi); + continue; + } + + unrollRecipeByUF(R); + } +} + +void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) { + assert(UF > 0 && "Unroll factor must be positive"); + Plan.setUF(UF); + auto Cleanup = make_scope_exit([&Plan]() { + auto Iter = vp_depth_first_deep(Plan.getEntry()); + // Remove recipes that are redundant after unrolling. 
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + auto *VPI = dyn_cast(&R); + if (VPI && + VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart && + VPI->getNumOperands() == 1) { + VPI->replaceAllUsesWith(VPI->getOperand(0)); + VPI->eraseFromParent(); + } + } + } + }); + if (UF == 1) { + return; + } + + UnrollState Unroller(Plan, UF, Ctx); + + Unroller.unrollBlock(Plan.getPreheader()); + + // Iterate over all blocks in the plan starting from Entry, and unroll + // recipes inside them. This includes the vector preheader and middle blocks, + // which may set up or post-process per-part values. + ReversePostOrderTraversal> RPOT( + Plan.getEntry()); + for (VPBlockBase *VPB : RPOT) + Unroller.unrollBlock(VPB); + + unsigned Part = 1; + // Remap operands of cloned header phis to update backedge values. The header + // phis cloned during unrolling are just after the header phi for part 0. + // Reset Part to 1 when reaching the first (part 0) recipe of a block. + for (VPRecipeBase &H : + Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { + // The second operand of Fixed Order Recurrence phi's, feeding the spliced + // value across the backedge, needs to remap to the last part of the spliced + // value. + if (isa(&H)) { + Unroller.remapOperand(&H, 1, UF - 1); + continue; + } + if (Unroller.contains(H.getVPSingleValue()) || + isa(&H)) { + Part = 1; + continue; + } + Unroller.remapOperands(&H, Part); + Part++; + } + + // Remap the operand of live-outs to the last part. 
+ for (const auto &[_, LO] : Plan.getLiveOuts()) { + VPValue *In = Unroller.getValueForPart(LO->getOperand(0), UF - 1); + LO->setOperand(0, In); + } +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 414f8866d24f0..4621c28b05129 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -72,3 +72,46 @@ const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) { [](const VPExpandSCEVRecipe *R) { return R->getSCEV(); }) .Default([&SE](const VPRecipeBase *) { return SE.getCouldNotCompute(); }); } + +bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) { + using namespace VPlanPatternMatch; + // Live-ins are uniform. + if (V->isLiveIn()) + return true; + + VPRecipeBase *R = V->getDefiningRecipe(); + if (R && V->isDefinedOutsideLoopRegions()) { + if (match(V->getDefiningRecipe(), + m_VPInstruction( + m_VPValue()))) + return false; + return all_of(R->operands(), + [](VPValue *Op) { return isUniformAcrossVFsAndUFs(Op); }); + } + + auto *CanonicalIV = R->getParent()->getPlan()->getCanonicalIV(); + // Canonical IV chain is uniform. + if (V == CanonicalIV || V == CanonicalIV->getBackedgeValue()) + return true; + + return TypeSwitch(R) + .Case([](const auto *R) { return true; }) + .Case([](const auto *R) { + // Loads and stores that are uniform across VF lanes are handled by + // VPReplicateRecipe.IsUniform. They are also uniform across UF parts if + // all their operands are invariant. + // TODO: Further relax the restrictions. + return R->isUniform() && + (isa(R->getUnderlyingValue())) && + all_of(R->operands(), + [](VPValue *Op) { return isUniformAcrossVFsAndUFs(Op); }); + }) + .Case([](const auto *R) { + // A cast is uniform according to its operand. + return isUniformAcrossVFsAndUFs(R->getOperand(0)); + }) + .Default([](const VPRecipeBase *) { // A value is considered non-uniform + // unless proven otherwise. 
+ return false; + }); +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index cb7a4e443176a..9657770020521 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -54,6 +54,13 @@ inline bool isUniformAfterVectorization(const VPValue *VPV) { /// Return true if \p V is a header mask in \p Plan. bool isHeaderMask(const VPValue *V, VPlan &Plan); + +/// Checks if \p V is uniform across all VF lanes and UF parts. It is considered +/// as such if it is either loop invariant (defined outside the vector region) +/// or its operand is known to be uniform across all VFs and UFs (e.g. +/// VPDerivedIV or VPCanonicalIVPHI). +bool isUniformAcrossVFsAndUFs(VPValue *V); + } // end namespace llvm::vputils #endif diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll b/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll index 22aaa563daa5a..b784c465f878e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll @@ -103,10 +103,10 @@ for.end: ; preds = %for.body ; CHECK-LABEL: @ptr_ind_plus2( ; CHECK: %[[V0:.*]] = load <8 x i32> -; CHECK: %[[V1:.*]] = load <8 x i32> ; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> poison, <4 x i32> -; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> poison, <4 x i32> ; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> poison, <4 x i32> +; CHECK: %[[V1:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> poison, <4 x i32> ; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> poison, <4 x i32> ; CHECK: mul nsw <4 x i32> ; CHECK: mul nsw <4 x i32> diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll index dc5cb246039e1..0f33e8fa79ce7 100644 --- 
a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll @@ -162,22 +162,18 @@ define void @test_shrink_zext_in_preheader(ptr noalias %src, ptr noalias %dst, i ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[A]] to i16 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i16> poison, i16 [[TMP0]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i16> poison, i16 [[B]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = mul <16 x i16> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = mul <16 x i16> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = lshr <16 x i16> [[TMP5]], -; CHECK-NEXT: [[TMP8:%.*]] = lshr <16 x i16> [[TMP6]], -; CHECK-NEXT: [[TMP9:%.*]] = trunc nuw <16 x i16> [[TMP7]] to <16 x i8> -; CHECK-NEXT: [[TMP10:%.*]] = trunc nuw <16 x i16> [[TMP8]] to <16 x i8> +; CHECK-NEXT: [[TMP5:%.*]] = mul <16 x i16> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = lshr <16 x i16> [[TMP5]], +; CHECK-NEXT: [[TMP9:%.*]] = trunc <16 x i16> [[TMP7]] to <16 x i8> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[INDEX]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 16 -; CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP12]], align 1 +; CHECK-NEXT: store <16 x i8> [[TMP10]], ptr [[TMP12]], align 1 ; CHECK-NEXT: store <16 x i8> [[TMP10]], ptr [[TMP13]], align 1 ; CHECK-NEXT: 
[[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992 @@ -244,15 +240,11 @@ define void @test_shrink_select(ptr noalias %src, ptr noalias %dst, i32 %A, i1 % ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[A]] to i16 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i16> poison, i16 [[TMP0]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = mul <16 x i16> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i16> [[TMP2]], -; CHECK-NEXT: [[TMP5:%.*]] = lshr <16 x i16> [[TMP3]], -; CHECK-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP4]], -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[C]], <16 x i16> [[TMP5]], <16 x i16> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[C]], <16 x i16> [[TMP6]], <16 x i16> [[TMP4]] +; CHECK-NEXT: [[TMP3:%.*]] = mul <16 x i16> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = lshr <16 x i16> [[TMP4]], +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[C]], <16 x i16> [[TMP5]], <16 x i16> [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = trunc <16 x i16> [[TMP7]] to <16 x i8> -; CHECK-NEXT: [[TMP10:%.*]] = trunc <16 x i16> [[TMP8]] to <16 x i8> ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll index 631d04dbcb088..623bc9d7f6b83 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll @@ -27,13 +27,9 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) { ; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: 
[[TMP11:%.*]] = mul i64 [[TMP10]], 4 ; CHECK-NEXT: [[TMP18:%.*]] = sdiv i64 [[M]], [[CONV6]] -; CHECK-NEXT: [[TMP19:%.*]] = sdiv i64 [[M]], [[CONV6]] ; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP18]] to i32 -; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP19]] to i32 ; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP18]], [[CONV61]] -; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP19]], [[CONV61]] ; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[X]], [[TMP20]] -; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[X]], [[TMP21]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -44,11 +40,11 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) { ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 1 ; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], [[TMP16]] ; CHECK-NEXT: [[TMP24:%.*]] = sub i64 [[TMP12]], [[TMP22]] -; CHECK-NEXT: [[TMP25:%.*]] = sub i64 [[TMP17]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = sub i64 [[TMP17]], [[TMP22]] ; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP24]] to i32 ; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP25]] to i32 ; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], [[TMP26]] -; CHECK-NEXT: [[TMP31:%.*]] = add i32 [[TMP29]], [[TMP27]] +; CHECK-NEXT: [[TMP31:%.*]] = add i32 [[TMP28]], [[TMP27]] ; CHECK-NEXT: [[TMP32:%.*]] = sext i32 [[TMP30]] to i64 ; CHECK-NEXT: [[TMP33:%.*]] = sext i32 [[TMP31]] to i64 ; CHECK-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP32]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll index 651fcf6bedc2d..babbe3a17c460 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll @@ -207,18 +207,16 @@ define void @test_exit_branch_cost(ptr %dst, i64 %x, i32 %y, ptr 
%dst.1, i1 %c.4 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[X]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP47:%.*]] = icmp eq <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT40:%.*]] = insertelement <2 x i1> poison, i1 [[C_3]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT41:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT40]], <2 x i1> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[C_4]], <2 x i1> [[BROADCAST_SPLAT41]], <2 x i1> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[C_4]], <2 x i1> [[BROADCAST_SPLAT41]], <2 x i1> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT56:%.*]] = insertelement <2 x i1> poison, i1 [[C_4]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT57:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT56]], <2 x i1> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE74:.*]] ] ; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP1]], +; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP47]], ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 ; CHECK-NEXT: br i1 [[TMP6]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; CHECK: [[PRED_STORE_IF]]: @@ -244,7 +242,7 @@ define void @test_exit_branch_cost(ptr %dst, i64 %x, i32 %y, ptr %dst.1, i1 %c.4 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE47]] ; CHECK: [[PRED_STORE_CONTINUE47]]: ; CHECK-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP2]], -; CHECK-NEXT: [[TMP11:%.*]] = xor <2 x i1> [[TMP3]], +; CHECK-NEXT: [[TMP11:%.*]] = xor <2 x i1> [[TMP2]], ; CHECK-NEXT: 
[[TMP12:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP10]], <2 x i1> zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP5]], <2 x i1> [[TMP11]], <2 x i1> zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0 @@ -276,7 +274,7 @@ define void @test_exit_branch_cost(ptr %dst, i64 %x, i32 %y, ptr %dst.1, i1 %c.4 ; CHECK-NEXT: [[TMP20:%.*]] = select <2 x i1> [[TMP18]], <2 x i1> [[BROADCAST_SPLAT57]], <2 x i1> zeroinitializer ; CHECK-NEXT: [[TMP21:%.*]] = select <2 x i1> [[TMP19]], <2 x i1> [[BROADCAST_SPLAT57]], <2 x i1> zeroinitializer ; CHECK-NEXT: [[TMP22:%.*]] = or <2 x i1> [[TMP47]], [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = or <2 x i1> [[TMP1]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = or <2 x i1> [[TMP47]], [[TMP21]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP20]], <2 x i64> zeroinitializer, <2 x i64> ; CHECK-NEXT: [[PREDPHI58:%.*]] = select <2 x i1> [[TMP21]], <2 x i64> zeroinitializer, <2 x i64> ; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i1> [[TMP22]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll index e6c620018b7db..68fac51971418 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll @@ -22,33 +22,22 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2) ; DEFAULT-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; DEFAULT-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 ; DEFAULT-NEXT: [[TMP13:%.*]] = add i64 [[Y]], 1 -; DEFAULT-NEXT: [[TMP14:%.*]] = add i64 [[Y]], 1 ; DEFAULT-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[SRC_1]], i64 [[TMP13]] -; DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[SRC_1]], i64 [[TMP14]] ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement poison, i32 [[X]], i64 0 ; DEFAULT-NEXT: 
[[BROADCAST_SPLAT7:%.*]] = shufflevector [[BROADCAST_SPLATINSERT6]], poison, zeroinitializer ; DEFAULT-NEXT: [[TMP25:%.*]] = lshr [[BROADCAST_SPLAT7]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; DEFAULT-NEXT: [[TMP26:%.*]] = lshr [[BROADCAST_SPLAT7]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; DEFAULT-NEXT: [[TMP31:%.*]] = shl [[BROADCAST_SPLAT7]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; DEFAULT-NEXT: [[TMP32:%.*]] = shl [[BROADCAST_SPLAT7]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; DEFAULT-NEXT: [[TMP33:%.*]] = or [[TMP25]], [[TMP31]] -; DEFAULT-NEXT: [[TMP34:%.*]] = or [[TMP26]], [[TMP32]] ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement poison, i32 [[Z]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector [[BROADCAST_SPLATINSERT8]], poison, zeroinitializer ; DEFAULT-NEXT: [[TMP39:%.*]] = or [[BROADCAST_SPLAT9]], [[BROADCAST_SPLAT7]] -; DEFAULT-NEXT: [[TMP40:%.*]] = or [[BROADCAST_SPLAT9]], [[BROADCAST_SPLAT7]] ; DEFAULT-NEXT: [[TMP41:%.*]] = and [[TMP39]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; DEFAULT-NEXT: [[TMP42:%.*]] = and [[TMP40]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; DEFAULT-NEXT: [[TMP43:%.*]] = xor [[TMP41]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; DEFAULT-NEXT: [[TMP44:%.*]] = xor [[TMP42]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; DEFAULT-NEXT: [[TMP45:%.*]] = zext [[TMP43]] to -; DEFAULT-NEXT: [[TMP46:%.*]] = zext [[TMP44]] to ; DEFAULT-NEXT: [[TMP61:%.*]] = extractelement [[TMP45]], i32 0 ; DEFAULT-NEXT: [[TMP62:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[TMP61]] ; DEFAULT-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, ptr [[TMP62]], i64 0 ; DEFAULT-NEXT: [[TMP47:%.*]] = 
shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; DEFAULT-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement poison, ptr [[TMP62]], i64 0 -; DEFAULT-NEXT: [[TMP48:%.*]] = shufflevector [[DOTSPLATINSERT3]], poison, zeroinitializer ; DEFAULT-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32() ; DEFAULT-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], 4 ; DEFAULT-NEXT: [[TMP9:%.*]] = sub i32 [[TMP8]], 1 @@ -78,11 +67,11 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2) ; DEFAULT-NEXT: [[TMP29:%.*]] = or [[TMP27]], shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) ; DEFAULT-NEXT: [[TMP30:%.*]] = or [[TMP28]], shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) ; DEFAULT-NEXT: [[TMP35:%.*]] = or [[TMP33]], [[TMP29]] -; DEFAULT-NEXT: [[TMP36:%.*]] = or [[TMP34]], [[TMP30]] +; DEFAULT-NEXT: [[TMP36:%.*]] = or [[TMP33]], [[TMP30]] ; DEFAULT-NEXT: [[TMP37:%.*]] = or [[TMP35]], [[BROADCAST_SPLAT7]] ; DEFAULT-NEXT: [[TMP38:%.*]] = or [[TMP36]], [[BROADCAST_SPLAT7]] ; DEFAULT-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP47]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; DEFAULT-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP48]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; DEFAULT-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP47]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) ; DEFAULT-NEXT: [[TMP49:%.*]] = lshr [[TMP37]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; DEFAULT-NEXT: [[TMP50:%.*]] = lshr [[TMP38]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; DEFAULT-NEXT: [[TMP51:%.*]] = zext [[TMP49]] to diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll index d62e5991da0ad..5a064dc45551e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll @@ -169,9 +169,8 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TMP4]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer ; DEFAULT-NEXT: [[TMP5:%.*]] = trunc <16 x i64> [[BROADCAST_SPLAT2]] to <16 x i8> -; DEFAULT-NEXT: [[TMP6:%.*]] = trunc <16 x i64> [[BROADCAST_SPLAT2]] to <16 x i8> ; DEFAULT-NEXT: [[TMP8:%.*]] = and <16 x i8> [[TMP5]], [[TMP7]] -; DEFAULT-NEXT: [[TMP9:%.*]] = and <16 x i8> [[TMP6]], [[TMP7]] +; DEFAULT-NEXT: [[TMP9:%.*]] = and <16 x i8> [[TMP5]], [[TMP7]] ; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP2]] ; DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] ; DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll index 6a4ff7f6003e0..2da2269023e27 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll @@ -20,15 +20,14 @@ define void @induction_i7(ptr %dst) #0 { ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP40:%.*]] = mul i64 [[TMP4]], 2 ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP40]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP40]], i64 0 +; CHECK-NEXT: [[DOTSPLAT_:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: 
[[DOTSPLAT:%.*]] = trunc [[DOTSPLAT_]] to ; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i8() ; CHECK-NEXT: [[TMP7:%.*]] = trunc [[TMP6]] to ; CHECK-NEXT: [[TMP8:%.*]] = add [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = mul [[TMP8]], shufflevector ( insertelement ( poison, i7 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP40]] to i7 -; CHECK-NEXT: [[TMP12:%.*]] = mul i7 1, [[TMP11]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i7 [[TMP12]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -94,15 +93,14 @@ define void @induction_i3_zext(ptr %dst) #0 { ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP40:%.*]] = mul i64 [[TMP4]], 2 ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP40]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP40]], i64 0 +; CHECK-NEXT: [[DOTSPLAT_:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[DOTSPLAT:%.*]] = trunc [[DOTSPLAT_]] to ; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i8() ; CHECK-NEXT: [[TMP7:%.*]] = trunc [[TMP6]] to ; CHECK-NEXT: [[TMP8:%.*]] = add [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = mul [[TMP8]], shufflevector ( insertelement ( poison, i3 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP40]] to i3 -; CHECK-NEXT: [[TMP12:%.*]] = mul i3 1, [[TMP11]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i3 [[TMP12]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: 
[[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll index bfb5cf8d66627..15819070f1e13 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll @@ -39,10 +39,10 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[C]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP7]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[NEXT_GEP]], align 4 -; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load , ptr [[NEXT_GEP2]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load , ptr [[NEXT_GEP2]], align 4 ; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC3]]) ; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 ; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll index aa55cd909c569..5005506e38334 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll @@ -432,12 +432,12 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] 
= getelementptr i8, ptr [[TMP8]], i32 0 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x i8>, ptr [[TMP9]], align 1 -; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <32 x i8>, ptr [[TMP10]], align 1 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <8 x i32> -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <32 x i8> [[WIDE_VEC2]], <32 x i8> poison, <8 x i32> ; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <32 x i8>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <32 x i8> [[WIDE_VEC2]], <32 x i8> poison, <8 x i32> ; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <32 x i8> [[WIDE_VEC2]], <32 x i8> poison, <8 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = zext <8 x i8> [[STRIDED_VEC4]] to <8 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = zext <8 x i8> [[STRIDED_VEC5]] to <8 x i32> diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll index 9a2fcf43c8157..9b7681d2d8d16 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll @@ -485,7 +485,6 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 ; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer -; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -498,11 +497,11 @@ define void @predicated_udiv(ptr noalias 
nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 ; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 ; FIXED-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> -; FIXED-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> +; FIXED-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> ; FIXED-NEXT: [[TMP10:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], [[TMP8]] ; FIXED-NEXT: [[TMP11:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[TMP9]] ; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP10]], <4 x i64> [[WIDE_LOAD]] -; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP11]], <4 x i64> [[WIDE_LOAD1]] +; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP11]], <4 x i64> [[WIDE_LOAD1]] ; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 ; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP5]], align 8 ; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 @@ -616,7 +615,6 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 ; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer -; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -629,11 +627,11 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 ; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr 
[[TMP5]], align 8 ; FIXED-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> -; FIXED-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> +; FIXED-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> ; FIXED-NEXT: [[TMP10:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], [[TMP8]] ; FIXED-NEXT: [[TMP11:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[TMP9]] ; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP10]], <4 x i64> [[WIDE_LOAD]] -; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP11]], <4 x i64> [[WIDE_LOAD1]] +; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP11]], <4 x i64> [[WIDE_LOAD1]] ; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 ; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP5]], align 8 ; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll index 87bc77cb7767f..79c7e4b64c30b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll @@ -1227,12 +1227,12 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP2]] ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP3]] ; FIXED-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0 -; FIXED-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0 ; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP6]], align 4 -; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> 
poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0 +; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4 +; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> ; FIXED-NEXT: [[TMP8:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]] ; FIXED-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], [[STRIDED_VEC4]] @@ -1415,12 +1415,12 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]] ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] ; FIXED-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 -; FIXED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0 ; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP6]], align 8 -; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0 +; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8 +; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> ; FIXED-NEXT: [[TMP8:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC3]] ; FIXED-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]] diff --git 
a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 349fd13a58d29..b6a9fed507acd 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -136,7 +136,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop ; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 -; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF>=1' { +; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' { ; CHECK-NEXT: Live-in vp<%0> = VF * UF ; CHECK-NEXT: Live-in vp<%1> = vector-trip-count ; CHECK-NEXT: vp<%2> = original trip-count @@ -340,7 +340,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop ; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 -; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF>=1' { +; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' { ; CHECK-NEXT: Live-in vp<%0> = VF * UF ; CHECK-NEXT: Live-in vp<%1> = vector-trip-count ; CHECK-NEXT: vp<%2> = original trip-count diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll index 957df15ee4583..b7c9392c5bf7a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll @@ -963,9 +963,6 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXEDLEN-NEXT: 
[[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; FIXEDLEN-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; FIXEDLEN-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 ; FIXEDLEN-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 ; FIXEDLEN-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 ; FIXEDLEN-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll index d66b3fc0da172..1f68b4a8fdde0 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll @@ -13,66 +13,65 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) { ; IF-EVL-LABEL: @interleave( ; IF-EVL-NEXT: entry: -; IF-EVL-NEXT: [[TMP17:%.*]] = sub i64 -1, [[N:%.*]] -; IF-EVL-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP31]], 8 -; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP17]], [[TMP2]] +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] ; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IF-EVL: vector.ph: ; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 -; IF-EVL-NEXT: [[TMP32:%.*]] = sub i64 [[TMP5]], 1 -; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP32]] +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 -; IF-EVL-NEXT: 
[[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 -; IF-EVL-NEXT: [[TMP101:%.*]] = mul i64 [[TMP10]], 2 -; IF-EVL-NEXT: [[TMP11:%.*]] = call @llvm.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP12:%.*]] = add [[TMP11]], zeroinitializer -; IF-EVL-NEXT: [[TMP13:%.*]] = mul [[TMP12]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; IF-EVL-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP13]] -; IF-EVL-NEXT: [[TMP37:%.*]] = mul i64 1, [[TMP10]] -; IF-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP37]], i64 0 -; IF-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; IF-EVL-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP11:%.*]] = add [[TMP10]], zeroinitializer +; IF-EVL-NEXT: [[TMP12:%.*]] = mul [[TMP11]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; IF-EVL-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP12]] +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: 
[[STEP_ADD:%.*]] = add [[VEC_IND]], [[DOTSPLAT]] -; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; IF-EVL-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4 -; IF-EVL-NEXT: [[TMP38:%.*]] = add i64 [[TMP19]], 0 -; IF-EVL-NEXT: [[TMP39:%.*]] = mul i64 [[TMP38]], 1 -; IF-EVL-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], [[TMP39]] -; IF-EVL-NEXT: [[TMP23:%.*]] = icmp ule [[VEC_IND]], [[BROADCAST_SPLAT]] -; IF-EVL-NEXT: [[TMP24:%.*]] = icmp ule [[STEP_ADD]], [[BROADCAST_SPLAT]] -; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], [[VEC_IND]], i32 0 -; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], [[STEP_ADD]], i32 0 -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP25]], i32 4, [[TMP23]], poison) -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP26]], i32 4, [[TMP24]], poison) -; IF-EVL-NEXT: [[TMP27:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], [[VEC_IND]], i32 1 -; IF-EVL-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], [[STEP_ADD]], i32 1 -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP27]], i32 4, [[TMP23]], poison) -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP28]], i32 4, [[TMP24]], poison) -; IF-EVL-NEXT: [[TMP29:%.*]] = add nsw [[WIDE_MASKED_GATHER3]], [[WIDE_MASKED_GATHER]] -; IF-EVL-NEXT: [[TMP30:%.*]] = add nsw [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER2]] -; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]] -; IF-EVL-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 -; IF-EVL-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 4 -; IF-EVL-NEXT: [[TMP36:%.*]] = 
getelementptr inbounds i32, ptr [[TMP8]], i64 [[TMP35]] -; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP29]], ptr [[TMP33]], i32 4, [[TMP23]]) -; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP30]], ptr [[TMP36]], i32 4, [[TMP24]]) -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP101]] -; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add [[STEP_ADD]], [[DOTSPLAT]] -; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-NEXT: [[STEP_ADD:%.*]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0 +; IF-EVL-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 +; IF-EVL-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 0 +; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 1 +; IF-EVL-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], [[TMP17]] +; IF-EVL-NEXT: [[TMP19:%.*]] = icmp ule [[VEC_IND]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[TMP20:%.*]] = icmp ule [[STEP_ADD]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], [[VEC_IND]], i32 0 +; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], [[STEP_ADD]], i32 0 +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP21]], i32 4, [[TMP19]], poison) +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP22]], i32 4, [[TMP20]], poison) +; IF-EVL-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], [[VEC_IND]], i32 1 +; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], [[STEP_ADD]], i32 1 +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP23]], i32 4, [[TMP19]], poison) +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP24]], i32 4, 
[[TMP20]], poison) +; IF-EVL-NEXT: [[TMP25:%.*]] = add nsw [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER]] +; IF-EVL-NEXT: [[TMP26:%.*]] = add nsw [[WIDE_MASKED_GATHER5]], [[WIDE_MASKED_GATHER3]] +; IF-EVL-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP13]] +; IF-EVL-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] +; IF-EVL-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 0 +; IF-EVL-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 4 +; IF-EVL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i64 [[TMP31]] +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP25]], ptr [[TMP29]], i32 4, [[TMP19]]) +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP26]], ptr [[TMP32]], i32 4, [[TMP20]]) +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add [[STEP_ADD]], [[BROADCAST_SPLAT]] +; IF-EVL-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: @@ -81,10 +80,10 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) { ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 0 -; IF-EVL-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 1 -; IF-EVL-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP21]] +; IF-EVL-NEXT: 
[[TMP35:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP35]], [[TMP34]] ; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] ; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 @@ -118,16 +117,16 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) { ; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[TMP6]], i32 0 ; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[TMP11]], i32 0 ; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 -; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 ; NO-VP-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP14]], align 4 -; NO-VP-NEXT: [[WIDE_VEC1:%.*]] = load , ptr [[TMP15]], align 4 ; NO-VP-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; NO-VP-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; NO-VP-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; NO-VP-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; NO-VP-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; NO-VP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 +; NO-VP-NEXT: [[WIDE_VEC1:%.*]] = load , ptr [[TMP17]], align 4 ; NO-VP-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC1]]) ; NO-VP-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 ; NO-VP-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 -; NO-VP-NEXT: [[TMP20:%.*]] = add nsw [[TMP17]], [[TMP16]] +; NO-VP-NEXT: [[TMP20:%.*]] = add nsw [[TMP16]], [[TMP15]] ; NO-VP-NEXT: [[TMP21:%.*]] = add nsw [[TMP19]], [[TMP18]] ; NO-VP-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] ; NO-VP-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] diff 
--git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll index f1f285eda38a6..e8adccca15ddb 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -562,16 +562,16 @@ define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 { ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[TMP12]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[TMP13]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[TMP14]], i32 0 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x double>, ptr [[TMP15]], align 8 -; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <16 x double>, ptr [[TMP16]], align 8 -; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <16 x double>, ptr [[TMP17]], align 8 -; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <16 x double>, ptr [[TMP18]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[TMP12]], i32 0 +; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <16 x double>, ptr [[TMP16]], align 8 ; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x double> [[WIDE_VEC1]], <16 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[TMP13]], i32 0 +; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <16 x double>, ptr [[TMP17]], align 8 ; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x double> [[WIDE_VEC2]], <16 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[TMP14]], i32 0 +; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <16 x double>, ptr [[TMP18]], align 8 ; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x double> [[WIDE_VEC3]], <16 x 
double> poison, <4 x i32> ; CHECK-NEXT: [[TMP19:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC4]], zeroinitializer @@ -783,9 +783,6 @@ define i64 @cost_assume(ptr %end, i64 %N) { ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[N:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -800,15 +797,12 @@ define i64 @cost_assume(ptr %end, i64 %N) { ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP12]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP12]]) -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP13]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP13]]) -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP14]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP14]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) +; CHECK-NEXT: tail call 
void @llvm.assume(i1 [[TMP11]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] @@ -866,9 +860,7 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i1> [[BROADCAST_SPLAT]] to <4 x i64> ; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i64> [[TMP0]], -; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i64> [[TMP0]], -; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP2]] to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i32> ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -880,8 +872,8 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 { ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 4 -; CHECK-NEXT: [[TMP11]] = and <4 x i32> [[VEC_PHI]], [[TMP3]] -; CHECK-NEXT: [[TMP12]] = and <4 x i32> [[VEC_PHI1]], [[TMP4]] +; CHECK-NEXT: [[TMP11]] = and <4 x i32> [[VEC_PHI]], [[TMP2]] +; CHECK-NEXT: [[TMP12]] = and <4 x i32> [[VEC_PHI1]], [[TMP2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 24 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] diff --git 
a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll index c1be67853bf7c..ba94663178bf4 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll @@ -143,15 +143,17 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l, ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[L]], 64 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[L]], [[N_MOD_VF]] +; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <16 x i16> poison, i16 [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT2]], <16 x i16> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLAT3:%.*]] = mul <16 x i16> , [[TMP2]] + ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i16> poison, i16 [[TMP0]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT]], <16 x i16> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = mul <16 x i16> , [[DOTSPLAT]] ; CHECK-NEXT: [[INDUCTION:%.*]] = add <16 x i16> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = mul i16 [[TMP0]], 16 -; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <16 x i16> poison, i16 [[TMP2]], i64 0 -; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT2]], <16 x i16> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i16> poison, i16 [[OFF]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT]], <16 x i16> poison, <16 x i32> zeroinitializer + ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16 ; CHECK-NEXT: [[IND_END:%.*]] = mul i16 [[DOTCAST]], [[TMP0]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll 
b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll index 6ac1e446d13ad..3338cc85772e4 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll @@ -19,10 +19,10 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) { ; SSE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[DOTIDX]] ; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]] ; SSE-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4 -; SSE-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4 ; SSE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; SSE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> +; SSE-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4 +; SSE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]] ; SSE-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC2]] @@ -61,16 +61,16 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) { ; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP3]] ; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP5]] ; AVX1-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4 -; AVX1-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4 -; AVX1-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i32>, ptr [[TMP8]], align 4 -; AVX1-NEXT: [[WIDE_VEC3:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4 ; AVX1-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, 
<4 x i32> -; AVX1-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x i32> [[WIDE_VEC3]], <8 x i32> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4 +; AVX1-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i32>, ptr [[TMP8]], align 4 +; AVX1-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> +; AVX1-NEXT: [[WIDE_VEC3:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4 +; AVX1-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x i32> [[WIDE_VEC3]], <8 x i32> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i32> [[WIDE_VEC3]], <8 x i32> poison, <4 x i32> ; AVX1-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC]] ; AVX1-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[STRIDED_VEC8]], [[STRIDED_VEC4]] @@ -115,16 +115,16 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) { ; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP5]] ; AVX2-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP6]], align 4 -; AVX2-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4 -; AVX2-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i32>, ptr [[TMP8]], align 4 -; AVX2-NEXT: [[WIDE_VEC3:%.*]] = load <16 x i32>, ptr [[TMP9]], align 4 ; AVX2-NEXT: 
[[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; AVX2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> -; AVX2-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> -; AVX2-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i32> [[WIDE_VEC3]], <16 x i32> poison, <8 x i32> ; AVX2-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> +; AVX2-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4 +; AVX2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> ; AVX2-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> +; AVX2-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i32>, ptr [[TMP8]], align 4 +; AVX2-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> ; AVX2-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> +; AVX2-NEXT: [[WIDE_VEC3:%.*]] = load <16 x i32>, ptr [[TMP9]], align 4 +; AVX2-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i32> [[WIDE_VEC3]], <16 x i32> poison, <8 x i32> ; AVX2-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <16 x i32> [[WIDE_VEC3]], <16 x i32> poison, <8 x i32> ; AVX2-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC]] ; AVX2-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[STRIDED_VEC8]], [[STRIDED_VEC4]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll index a53bd92263191..89c0cafde062a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll @@ -185,10 +185,10 @@ define void @test_scalar_cost_single_store_loop_varying_cond(ptr %dst, ptr noali ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, 
ptr [[SRC]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP6]], align 4 -; CHECK-NEXT: [[WIDE_VEC4:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_VEC4:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4 ; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i32> [[WIDE_VEC4]], <16 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[STRIDED_VEC]], ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <4 x i32> [[STRIDED_VEC5]], diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll index 7cbf0ab025206..7b16665a416d4 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll @@ -101,24 +101,24 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; SSE41-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[S1:%.*]], i64 [[TMP2]] ; SSE41-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP3]] ; SSE41-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0 -; SSE41-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 ; SSE41-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP6]], align 2 -; SSE41-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2 ; SSE41-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> -; SSE41-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> ; SSE41-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector 
<8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> +; SSE41-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 +; SSE41-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2 +; SSE41-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> ; SSE41-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> ; SSE41-NEXT: [[TMP8:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32> ; SSE41-NEXT: [[TMP9:%.*]] = sext <4 x i16> [[STRIDED_VEC2]] to <4 x i32> ; SSE41-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP2]] ; SSE41-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP3]] ; SSE41-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0 -; SSE41-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0 ; SSE41-NEXT: [[WIDE_VEC5:%.*]] = load <8 x i16>, ptr [[TMP12]], align 2 -; SSE41-NEXT: [[WIDE_VEC6:%.*]] = load <8 x i16>, ptr [[TMP13]], align 2 ; SSE41-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x i16> [[WIDE_VEC5]], <8 x i16> poison, <4 x i32> -; SSE41-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32> ; SSE41-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i16> [[WIDE_VEC5]], <8 x i16> poison, <4 x i32> +; SSE41-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0 +; SSE41-NEXT: [[WIDE_VEC6:%.*]] = load <8 x i16>, ptr [[TMP13]], align 2 +; SSE41-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32> ; SSE41-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32> ; SSE41-NEXT: [[TMP14:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32> ; SSE41-NEXT: [[TMP15:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32> @@ -203,20 +203,20 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; AVX1-NEXT: [[TMP10:%.*]] = 
getelementptr inbounds i16, ptr [[S1]], i64 [[TMP6]] ; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP7]] ; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0 -; AVX1-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[TMP9]], i32 0 -; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0 -; AVX1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0 ; AVX1-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP12]], align 2 -; AVX1-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i16>, ptr [[TMP13]], align 2 -; AVX1-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP14]], align 2 -; AVX1-NEXT: [[WIDE_VEC3:%.*]] = load <8 x i16>, ptr [[TMP15]], align 2 ; AVX1-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[TMP9]], i32 0 +; AVX1-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i16>, ptr [[TMP13]], align 2 +; AVX1-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0 +; AVX1-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP14]], align 2 +; AVX1-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP15:%.*]] = 
getelementptr inbounds i16, ptr [[TMP11]], i32 0 +; AVX1-NEXT: [[WIDE_VEC3:%.*]] = load <8 x i16>, ptr [[TMP15]], align 2 +; AVX1-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[TMP16:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32> ; AVX1-NEXT: [[TMP17:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32> @@ -227,20 +227,20 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; AVX1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP6]] ; AVX1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP7]] ; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[TMP20]], i32 0 -; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[TMP21]], i32 0 -; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i16, ptr [[TMP22]], i32 0 -; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i16, ptr [[TMP23]], i32 0 ; AVX1-NEXT: [[WIDE_VEC11:%.*]] = load <8 x i16>, ptr [[TMP24]], align 2 -; AVX1-NEXT: [[WIDE_VEC12:%.*]] = load <8 x i16>, ptr [[TMP25]], align 2 -; AVX1-NEXT: [[WIDE_VEC13:%.*]] = load <8 x i16>, ptr [[TMP26]], align 2 -; AVX1-NEXT: [[WIDE_VEC14:%.*]] = load <8 x i16>, ptr [[TMP27]], align 2 ; AVX1-NEXT: [[STRIDED_VEC15:%.*]] = shufflevector <8 x i16> [[WIDE_VEC11]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC16:%.*]] = shufflevector <8 x i16> [[WIDE_VEC12]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC18:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC19:%.*]] = shufflevector <8 x i16> [[WIDE_VEC11]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[TMP21]], i32 0 +; AVX1-NEXT: [[WIDE_VEC12:%.*]] = 
load <8 x i16>, ptr [[TMP25]], align 2 +; AVX1-NEXT: [[STRIDED_VEC16:%.*]] = shufflevector <8 x i16> [[WIDE_VEC12]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC20:%.*]] = shufflevector <8 x i16> [[WIDE_VEC12]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i16, ptr [[TMP22]], i32 0 +; AVX1-NEXT: [[WIDE_VEC13:%.*]] = load <8 x i16>, ptr [[TMP26]], align 2 +; AVX1-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC21:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i16, ptr [[TMP23]], i32 0 +; AVX1-NEXT: [[WIDE_VEC14:%.*]] = load <8 x i16>, ptr [[TMP27]], align 2 +; AVX1-NEXT: [[STRIDED_VEC18:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC22:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[TMP28:%.*]] = sext <4 x i16> [[STRIDED_VEC15]] to <4 x i32> ; AVX1-NEXT: [[TMP29:%.*]] = sext <4 x i16> [[STRIDED_VEC16]] to <4 x i32> diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll index fda7655cd9cf8..f4865c816f253 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll @@ -115,13 +115,7 @@ define i32 @uniform_address(ptr align(4) %addr, i32 %byte_offset) { ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = udiv i32 [[BYTE_OFFSET:%.*]], 4 -; CHECK-NEXT: [[TMP1:%.*]] = udiv i32 [[BYTE_OFFSET]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[BYTE_OFFSET]], 4 -; CHECK-NEXT: [[TMP3:%.*]] = udiv i32 [[BYTE_OFFSET]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[ADDR:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[ADDR]], i32 [[TMP1]] 
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[ADDR]], i32 [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[ADDR]], i32 [[TMP3]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -211,28 +205,19 @@ define void @uniform_store_varying_value(ptr align(4) %addr) { ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32 +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i32 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 3 -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP0]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 5 -; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP0]], 6 -; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP0]], 7 -; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 8 -; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP0]], 9 -; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP0]], 10 -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP0]], 11 -; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP0]], 12 -; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP0]], 13 -; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP0]], 14 -; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP0]], 15 -; CHECK-NEXT: store i32 [[TMP16]], ptr [[ADDR:%.*]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 16 -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 4 +; CHECK-NEXT: 
[[TMP3:%.*]] = add i32 [[TMP0]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 12 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP0]], 13 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 14 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP0]], 15 +; CHECK-NEXT: store i32 [[TMP7]], ptr [[ADDR:%.*]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll new file mode 100644 index 0000000000000..b6d0b964cb608 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll @@ -0,0 +1,70 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S %s | FileCheck %s + +define float @for_load_interleave_only(ptr %src) { +; CHECK-LABEL: define float @for_load_interleave_only( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[SRC]], i64 16000 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi float [ 0.000000e+00, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16 +; 
CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP0]] +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[NEXT_GEP]], align 4 +; CHECK-NEXT: [[TMP3]] = load float, ptr [[NEXT_GEP2]], align 4 +; CHECK-NEXT: store float 0.000000e+00, ptr [[NEXT_GEP]], align 4 +; CHECK-NEXT: store float 0.000000e+00, ptr [[NEXT_GEP2]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1001, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[SRC]], %[[ENTRY]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[FOR:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 16 +; CHECK-NEXT: [[L]] = load float, ptr [[PTR_IV]], align 4 +; CHECK-NEXT: store float 0.000000e+00, ptr [[PTR_IV]], align 4 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 1000 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[FOR_LCSSA:%.*]] = phi float [ [[FOR]], %[[LOOP]] ], [ [[TMP2]], %[[MIDDLE_BLOCK]] ] +; 
CHECK-NEXT: ret float [[FOR_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ] + %ptr.iv = phi ptr [ %src, %entry ], [ %ptr.iv.next, %loop ] + %for = phi float [ 0.000000e+00, %entry ], [ %l, %loop ] + %iv.next = add i32 %iv, 1 + %ptr.iv.next = getelementptr i8, ptr %ptr.iv, i64 16 + %l = load float, ptr %ptr.iv, align 4 + store float 0.000000e+00, ptr %ptr.iv, align 4 + %ec = icmp eq i32 %iv, 1000 + br i1 %ec, label %exit, label %loop + +exit: + ret float %for +} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll index b54d3cd7dd185..79979790716fe 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -1291,19 +1291,18 @@ define i64 @constant_folded_previous_value() { ; UNROLL-NO-VF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-VF: vector.ph: ; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i64 0, 1 -; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i64 0, 1 ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: ; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP1]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP0]], [[VECTOR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; UNROLL-NO-VF-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP12:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; UNROLL-NO-VF-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: ; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; UNROLL-NO-VF-NEXT: br label [[SCALAR_BODY:%.*]] ; UNROLL-NO-VF: scalar.body: ; UNROLL-NO-VF-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[SCALAR_BODY]] ] @@ -2546,7 +2545,7 @@ define void @sink_dead_inst(ptr %a) { ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ , [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR2:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], ; UNROLL-NO-IC-NEXT: [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16 ; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]] @@ -2574,17 +2573,17 @@ define void @sink_dead_inst(ptr %a) { ; UNROLL-NO-IC-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP7]], i32 3 -; 
UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3 +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3 ; UNROLL-NO-IC-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 13, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ] ; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY]] ] +; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT3:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY]] ] ; UNROLL-NO-IC-NEXT: br label [[FOR_COND:%.*]] ; UNROLL-NO-IC: for.cond: ; UNROLL-NO-IC-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ] ; UNROLL-NO-IC-NEXT: [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ] -; UNROLL-NO-IC-NEXT: [[REC_2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT4]], [[SCALAR_PH]] ], [ [[REC_2_PREV:%.*]], [[FOR_COND]] ] +; UNROLL-NO-IC-NEXT: [[REC_2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT3]], [[SCALAR_PH]] ], [ [[REC_2_PREV:%.*]], [[FOR_COND]] ] ; UNROLL-NO-IC-NEXT: [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10 ; UNROLL-NO-IC-NEXT: [[CMP:%.*]] = icmp eq i32 [[REC_2]], 15 ; UNROLL-NO-IC-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 @@ -2728,21 +2727,21 @@ define i32 @sink_into_replication_region(i32 %y) { ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1 -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0 -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> 
[[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer +; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0 +; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> poison, <4 x i32> zeroinitializer ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: -; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE18:%.*]] ] -; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_UDIV_CONTINUE18]] ] -; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_UDIV_CONTINUE18]] ] -; UNROLL-NO-IC-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_UDIV_CONTINUE18]] ] +; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE20:%.*]] ] +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_UDIV_CONTINUE20]] ] +; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_UDIV_CONTINUE20]] ] +; UNROLL-NO-IC-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_UDIV_CONTINUE20]] ] ; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]] ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0 ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; UNROLL-NO-IC-NEXT: [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], -; UNROLL-NO-IC-NEXT: [[VEC_IV2:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], -; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = icmp ule <4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT4]] -; 
UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = icmp ule <4 x i32> [[VEC_IV2]], [[BROADCAST_SPLAT4]] +; UNROLL-NO-IC-NEXT: [[VEC_IV4:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = icmp ule <4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT6]] +; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = icmp ule <4 x i32> [[VEC_IV4]], [[BROADCAST_SPLAT6]] ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0 ; UNROLL-NO-IC-NEXT: br i1 [[TMP4]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; UNROLL-NO-IC: pred.udiv.if: @@ -2753,68 +2752,68 @@ define i32 @sink_into_replication_region(i32 %y) { ; UNROLL-NO-IC: pred.udiv.continue: ; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UDIV_IF]] ] ; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1 -; UNROLL-NO-IC-NEXT: br i1 [[TMP9]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]] -; UNROLL-NO-IC: pred.udiv.if5: +; UNROLL-NO-IC-NEXT: br i1 [[TMP9]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]] +; UNROLL-NO-IC: pred.udiv.if7: ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = add i32 [[OFFSET_IDX]], -1 ; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = udiv i32 219220132, [[TMP10]] ; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP11]], i32 1 -; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE6]] -; UNROLL-NO-IC: pred.udiv.continue6: -; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF5]] ] +; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE8]] +; UNROLL-NO-IC: pred.udiv.continue8: +; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF7]] ] ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2 -; UNROLL-NO-IC-NEXT: br i1 [[TMP14]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]] -; UNROLL-NO-IC: pred.udiv.if7: +; UNROLL-NO-IC-NEXT: br 
i1 [[TMP14]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]] +; UNROLL-NO-IC: pred.udiv.if9: ; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], -2 ; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = udiv i32 219220132, [[TMP15]] ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP16]], i32 2 -; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE8]] -; UNROLL-NO-IC: pred.udiv.continue8: -; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP17]], [[PRED_UDIV_IF7]] ] +; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE10]] +; UNROLL-NO-IC: pred.udiv.continue10: +; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP17]], [[PRED_UDIV_IF9]] ] ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3 -; UNROLL-NO-IC-NEXT: br i1 [[TMP19]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]] -; UNROLL-NO-IC: pred.udiv.if9: +; UNROLL-NO-IC-NEXT: br i1 [[TMP19]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]] +; UNROLL-NO-IC: pred.udiv.if11: ; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = add i32 [[OFFSET_IDX]], -3 ; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = udiv i32 219220132, [[TMP20]] ; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP21]], i32 3 -; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE10]] -; UNROLL-NO-IC: pred.udiv.continue10: -; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP22]], [[PRED_UDIV_IF9]] ] +; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE12]] +; UNROLL-NO-IC: pred.udiv.continue12: +; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE10]] ], [ [[TMP22]], [[PRED_UDIV_IF11]] ] ; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 -; UNROLL-NO-IC-NEXT: br i1 [[TMP24]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]] -; UNROLL-NO-IC: 
pred.udiv.if11: +; UNROLL-NO-IC-NEXT: br i1 [[TMP24]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]] +; UNROLL-NO-IC: pred.udiv.if13: ; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], -4 ; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = udiv i32 219220132, [[TMP25]] ; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0 -; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE12]] -; UNROLL-NO-IC: pred.udiv.continue12: -; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE10]] ], [ [[TMP27]], [[PRED_UDIV_IF11]] ] +; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE14]] +; UNROLL-NO-IC: pred.udiv.continue14: +; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE12]] ], [ [[TMP27]], [[PRED_UDIV_IF13]] ] ; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 -; UNROLL-NO-IC-NEXT: br i1 [[TMP29]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]] -; UNROLL-NO-IC: pred.udiv.if13: +; UNROLL-NO-IC-NEXT: br i1 [[TMP29]], label [[PRED_UDIV_IF15:%.*]], label [[PRED_UDIV_CONTINUE16:%.*]] +; UNROLL-NO-IC: pred.udiv.if15: ; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = add i32 [[OFFSET_IDX]], -5 ; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = udiv i32 219220132, [[TMP30]] ; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP31]], i32 1 -; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE14]] -; UNROLL-NO-IC: pred.udiv.continue14: -; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP32]], [[PRED_UDIV_IF13]] ] +; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE16]] +; UNROLL-NO-IC: pred.udiv.continue16: +; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP32]], [[PRED_UDIV_IF15]] ] ; UNROLL-NO-IC-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 -; UNROLL-NO-IC-NEXT: br i1 [[TMP34]], label [[PRED_UDIV_IF15:%.*]], label 
[[PRED_UDIV_CONTINUE16:%.*]] -; UNROLL-NO-IC: pred.udiv.if15: +; UNROLL-NO-IC-NEXT: br i1 [[TMP34]], label [[PRED_UDIV_IF17:%.*]], label [[PRED_UDIV_CONTINUE18:%.*]] +; UNROLL-NO-IC: pred.udiv.if17: ; UNROLL-NO-IC-NEXT: [[TMP35:%.*]] = add i32 [[OFFSET_IDX]], -6 ; UNROLL-NO-IC-NEXT: [[TMP36:%.*]] = udiv i32 219220132, [[TMP35]] ; UNROLL-NO-IC-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP36]], i32 2 -; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE16]] -; UNROLL-NO-IC: pred.udiv.continue16: -; UNROLL-NO-IC-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP37]], [[PRED_UDIV_IF15]] ] +; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE18]] +; UNROLL-NO-IC: pred.udiv.continue18: +; UNROLL-NO-IC-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP37]], [[PRED_UDIV_IF17]] ] ; UNROLL-NO-IC-NEXT: [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 -; UNROLL-NO-IC-NEXT: br i1 [[TMP39]], label [[PRED_UDIV_IF17:%.*]], label [[PRED_UDIV_CONTINUE18]] -; UNROLL-NO-IC: pred.udiv.if17: +; UNROLL-NO-IC-NEXT: br i1 [[TMP39]], label [[PRED_UDIV_IF19:%.*]], label [[PRED_UDIV_CONTINUE20]] +; UNROLL-NO-IC: pred.udiv.if19: ; UNROLL-NO-IC-NEXT: [[TMP40:%.*]] = add i32 [[OFFSET_IDX]], -7 ; UNROLL-NO-IC-NEXT: [[TMP41:%.*]] = udiv i32 219220132, [[TMP40]] ; UNROLL-NO-IC-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i32 3 -; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE18]] -; UNROLL-NO-IC: pred.udiv.continue18: -; UNROLL-NO-IC-NEXT: [[TMP43]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP42]], [[PRED_UDIV_IF17]] ] +; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE20]] +; UNROLL-NO-IC: pred.udiv.continue20: +; UNROLL-NO-IC-NEXT: [[TMP43]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE18]] ], [ [[TMP42]], [[PRED_UDIV_IF19]] ] ; UNROLL-NO-IC-NEXT: [[TMP44:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP23]], <4 x i32> ; 
UNROLL-NO-IC-NEXT: [[TMP45:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> [[TMP43]], <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP46]] = add <4 x i32> [[VEC_PHI]], [[TMP44]] @@ -3037,11 +3036,11 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: -; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE31:%.*]] ] -; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE31]] ] -; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_STORE_CONTINUE31]] ] -; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_STORE_CONTINUE31]] ] -; UNROLL-NO-IC-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_STORE_CONTINUE31]] ] +; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] +; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_STORE_CONTINUE30]] ] +; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_STORE_CONTINUE30]] ] +; UNROLL-NO-IC-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_STORE_CONTINUE30]] ] ; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], ; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]] ; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 0 @@ -3063,65 +3062,65 @@ 
define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; UNROLL-NO-IC: pred.udiv.continue: ; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP14]], [[PRED_UDIV_IF]] ] ; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP10]], i32 1 -; UNROLL-NO-IC-NEXT: br i1 [[TMP16]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]] -; UNROLL-NO-IC: pred.udiv.if4: +; UNROLL-NO-IC-NEXT: br i1 [[TMP16]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]] +; UNROLL-NO-IC: pred.udiv.if3: ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = udiv i32 219220132, [[TMP3]] ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP17]], i32 1 -; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE5]] -; UNROLL-NO-IC: pred.udiv.continue5: -; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP15]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP18]], [[PRED_UDIV_IF4]] ] +; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE4]] +; UNROLL-NO-IC: pred.udiv.continue4: +; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP15]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP18]], [[PRED_UDIV_IF3]] ] ; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP10]], i32 2 -; UNROLL-NO-IC-NEXT: br i1 [[TMP20]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]] -; UNROLL-NO-IC: pred.udiv.if6: +; UNROLL-NO-IC-NEXT: br i1 [[TMP20]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]] +; UNROLL-NO-IC: pred.udiv.if5: ; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = udiv i32 219220132, [[TMP4]] ; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP21]], i32 2 -; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE7]] -; UNROLL-NO-IC: pred.udiv.continue7: -; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_UDIV_CONTINUE5]] ], [ [[TMP22]], [[PRED_UDIV_IF6]] ] +; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE6]] +; UNROLL-NO-IC: pred.udiv.continue6: +; UNROLL-NO-IC-NEXT: 
[[TMP23:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_UDIV_CONTINUE4]] ], [ [[TMP22]], [[PRED_UDIV_IF5]] ] ; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP10]], i32 3 -; UNROLL-NO-IC-NEXT: br i1 [[TMP24]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]] -; UNROLL-NO-IC: pred.udiv.if8: +; UNROLL-NO-IC-NEXT: br i1 [[TMP24]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]] +; UNROLL-NO-IC: pred.udiv.if7: ; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = udiv i32 219220132, [[TMP5]] ; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP23]], i32 [[TMP25]], i32 3 -; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE9]] -; UNROLL-NO-IC: pred.udiv.continue9: -; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = phi <4 x i32> [ [[TMP23]], [[PRED_UDIV_CONTINUE7]] ], [ [[TMP26]], [[PRED_UDIV_IF8]] ] +; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE8]] +; UNROLL-NO-IC: pred.udiv.continue8: +; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = phi <4 x i32> [ [[TMP23]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP26]], [[PRED_UDIV_IF7]] ] ; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP11]], i32 0 -; UNROLL-NO-IC-NEXT: br i1 [[TMP28]], label [[PRED_UDIV_IF10:%.*]], label [[PRED_UDIV_CONTINUE11:%.*]] -; UNROLL-NO-IC: pred.udiv.if10: +; UNROLL-NO-IC-NEXT: br i1 [[TMP28]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]] +; UNROLL-NO-IC: pred.udiv.if9: ; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = udiv i32 219220132, [[TMP6]] ; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP29]], i32 0 -; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE11]] -; UNROLL-NO-IC: pred.udiv.continue11: -; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE9]] ], [ [[TMP30]], [[PRED_UDIV_IF10]] ] +; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE10]] +; UNROLL-NO-IC: pred.udiv.continue10: +; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE8]] ], [ [[TMP30]], [[PRED_UDIV_IF9]] ] ; 
UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = extractelement <4 x i1> [[TMP11]], i32 1 -; UNROLL-NO-IC-NEXT: br i1 [[TMP32]], label [[PRED_UDIV_IF12:%.*]], label [[PRED_UDIV_CONTINUE13:%.*]] -; UNROLL-NO-IC: pred.udiv.if12: +; UNROLL-NO-IC-NEXT: br i1 [[TMP32]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]] +; UNROLL-NO-IC: pred.udiv.if11: ; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = udiv i32 219220132, [[TMP7]] ; UNROLL-NO-IC-NEXT: [[TMP34:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP33]], i32 1 -; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE13]] -; UNROLL-NO-IC: pred.udiv.continue13: -; UNROLL-NO-IC-NEXT: [[TMP35:%.*]] = phi <4 x i32> [ [[TMP31]], [[PRED_UDIV_CONTINUE11]] ], [ [[TMP34]], [[PRED_UDIV_IF12]] ] +; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE12]] +; UNROLL-NO-IC: pred.udiv.continue12: +; UNROLL-NO-IC-NEXT: [[TMP35:%.*]] = phi <4 x i32> [ [[TMP31]], [[PRED_UDIV_CONTINUE10]] ], [ [[TMP34]], [[PRED_UDIV_IF11]] ] ; UNROLL-NO-IC-NEXT: [[TMP36:%.*]] = extractelement <4 x i1> [[TMP11]], i32 2 -; UNROLL-NO-IC-NEXT: br i1 [[TMP36]], label [[PRED_UDIV_IF14:%.*]], label [[PRED_UDIV_CONTINUE15:%.*]] -; UNROLL-NO-IC: pred.udiv.if14: +; UNROLL-NO-IC-NEXT: br i1 [[TMP36]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]] +; UNROLL-NO-IC: pred.udiv.if13: ; UNROLL-NO-IC-NEXT: [[TMP37:%.*]] = udiv i32 219220132, [[TMP8]] ; UNROLL-NO-IC-NEXT: [[TMP38:%.*]] = insertelement <4 x i32> [[TMP35]], i32 [[TMP37]], i32 2 -; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE15]] -; UNROLL-NO-IC: pred.udiv.continue15: -; UNROLL-NO-IC-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP35]], [[PRED_UDIV_CONTINUE13]] ], [ [[TMP38]], [[PRED_UDIV_IF14]] ] +; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE14]] +; UNROLL-NO-IC: pred.udiv.continue14: +; UNROLL-NO-IC-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP35]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP38]], [[PRED_UDIV_IF13]] ] ; UNROLL-NO-IC-NEXT: [[TMP40:%.*]] = extractelement <4 x i1> [[TMP11]], i32 3 -; 
UNROLL-NO-IC-NEXT: br i1 [[TMP40]], label [[PRED_UDIV_IF16:%.*]], label [[PRED_UDIV_CONTINUE17:%.*]] -; UNROLL-NO-IC: pred.udiv.if16: +; UNROLL-NO-IC-NEXT: br i1 [[TMP40]], label [[PRED_UDIV_IF15:%.*]], label [[PRED_UDIV_CONTINUE16:%.*]] +; UNROLL-NO-IC: pred.udiv.if15: ; UNROLL-NO-IC-NEXT: [[TMP41:%.*]] = udiv i32 219220132, [[TMP9]] ; UNROLL-NO-IC-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP41]], i32 3 -; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE17]] -; UNROLL-NO-IC: pred.udiv.continue17: -; UNROLL-NO-IC-NEXT: [[TMP43]] = phi <4 x i32> [ [[TMP39]], [[PRED_UDIV_CONTINUE15]] ], [ [[TMP42]], [[PRED_UDIV_IF16]] ] +; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE16]] +; UNROLL-NO-IC: pred.udiv.continue16: +; UNROLL-NO-IC-NEXT: [[TMP43]] = phi <4 x i32> [ [[TMP39]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP42]], [[PRED_UDIV_IF15]] ] ; UNROLL-NO-IC-NEXT: [[TMP44:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP27]], <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP45:%.*]] = shufflevector <4 x i32> [[TMP27]], <4 x i32> [[TMP43]], <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP46]] = add <4 x i32> [[VEC_PHI]], [[TMP44]] -; UNROLL-NO-IC-NEXT: [[TMP47]] = add <4 x i32> [[VEC_PHI3]], [[TMP45]] +; UNROLL-NO-IC-NEXT: [[TMP47]] = add <4 x i32> [[VEC_PHI2]], [[TMP45]] ; UNROLL-NO-IC-NEXT: [[TMP48:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0 ; UNROLL-NO-IC-NEXT: br i1 [[TMP48]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; UNROLL-NO-IC: pred.store.if: @@ -3131,63 +3130,63 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE]] ; UNROLL-NO-IC: pred.store.continue: ; UNROLL-NO-IC-NEXT: [[TMP51:%.*]] = extractelement <4 x i1> [[TMP10]], i32 1 -; UNROLL-NO-IC-NEXT: br i1 [[TMP51]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]] -; UNROLL-NO-IC: pred.store.if18: +; UNROLL-NO-IC-NEXT: br i1 [[TMP51]], label [[PRED_STORE_IF17:%.*]], label 
[[PRED_STORE_CONTINUE18:%.*]] +; UNROLL-NO-IC: pred.store.if17: ; UNROLL-NO-IC-NEXT: [[TMP52:%.*]] = add i32 [[INDEX]], 1 ; UNROLL-NO-IC-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP52]] ; UNROLL-NO-IC-NEXT: store i32 [[TMP3]], ptr [[TMP53]], align 4 -; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE19]] -; UNROLL-NO-IC: pred.store.continue19: +; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE18]] +; UNROLL-NO-IC: pred.store.continue18: ; UNROLL-NO-IC-NEXT: [[TMP54:%.*]] = extractelement <4 x i1> [[TMP10]], i32 2 -; UNROLL-NO-IC-NEXT: br i1 [[TMP54]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]] -; UNROLL-NO-IC: pred.store.if20: +; UNROLL-NO-IC-NEXT: br i1 [[TMP54]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] +; UNROLL-NO-IC: pred.store.if19: ; UNROLL-NO-IC-NEXT: [[TMP55:%.*]] = add i32 [[INDEX]], 2 ; UNROLL-NO-IC-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP55]] ; UNROLL-NO-IC-NEXT: store i32 [[TMP4]], ptr [[TMP56]], align 4 -; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE21]] -; UNROLL-NO-IC: pred.store.continue21: +; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE20]] +; UNROLL-NO-IC: pred.store.continue20: ; UNROLL-NO-IC-NEXT: [[TMP57:%.*]] = extractelement <4 x i1> [[TMP10]], i32 3 -; UNROLL-NO-IC-NEXT: br i1 [[TMP57]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]] -; UNROLL-NO-IC: pred.store.if22: +; UNROLL-NO-IC-NEXT: br i1 [[TMP57]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] +; UNROLL-NO-IC: pred.store.if21: ; UNROLL-NO-IC-NEXT: [[TMP58:%.*]] = add i32 [[INDEX]], 3 ; UNROLL-NO-IC-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP58]] ; UNROLL-NO-IC-NEXT: store i32 [[TMP5]], ptr [[TMP59]], align 4 -; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE23]] -; UNROLL-NO-IC: pred.store.continue23: +; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE22]] +; UNROLL-NO-IC: pred.store.continue22: ; 
UNROLL-NO-IC-NEXT: [[TMP60:%.*]] = extractelement <4 x i1> [[TMP11]], i32 0 -; UNROLL-NO-IC-NEXT: br i1 [[TMP60]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]] -; UNROLL-NO-IC: pred.store.if24: +; UNROLL-NO-IC-NEXT: br i1 [[TMP60]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] +; UNROLL-NO-IC: pred.store.if23: ; UNROLL-NO-IC-NEXT: [[TMP61:%.*]] = add i32 [[INDEX]], 4 ; UNROLL-NO-IC-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP61]] ; UNROLL-NO-IC-NEXT: store i32 [[TMP6]], ptr [[TMP62]], align 4 -; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE25]] -; UNROLL-NO-IC: pred.store.continue25: +; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE24]] +; UNROLL-NO-IC: pred.store.continue24: ; UNROLL-NO-IC-NEXT: [[TMP63:%.*]] = extractelement <4 x i1> [[TMP11]], i32 1 -; UNROLL-NO-IC-NEXT: br i1 [[TMP63]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]] -; UNROLL-NO-IC: pred.store.if26: +; UNROLL-NO-IC-NEXT: br i1 [[TMP63]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] +; UNROLL-NO-IC: pred.store.if25: ; UNROLL-NO-IC-NEXT: [[TMP64:%.*]] = add i32 [[INDEX]], 5 ; UNROLL-NO-IC-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP64]] ; UNROLL-NO-IC-NEXT: store i32 [[TMP7]], ptr [[TMP65]], align 4 -; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE27]] -; UNROLL-NO-IC: pred.store.continue27: +; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE26]] +; UNROLL-NO-IC: pred.store.continue26: ; UNROLL-NO-IC-NEXT: [[TMP66:%.*]] = extractelement <4 x i1> [[TMP11]], i32 2 -; UNROLL-NO-IC-NEXT: br i1 [[TMP66]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]] -; UNROLL-NO-IC: pred.store.if28: +; UNROLL-NO-IC-NEXT: br i1 [[TMP66]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] +; UNROLL-NO-IC: pred.store.if27: ; UNROLL-NO-IC-NEXT: [[TMP67:%.*]] = add i32 [[INDEX]], 6 ; UNROLL-NO-IC-NEXT: [[TMP68:%.*]] = getelementptr 
inbounds i32, ptr [[X]], i32 [[TMP67]] ; UNROLL-NO-IC-NEXT: store i32 [[TMP8]], ptr [[TMP68]], align 4 -; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE29]] -; UNROLL-NO-IC: pred.store.continue29: +; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE28]] +; UNROLL-NO-IC: pred.store.continue28: ; UNROLL-NO-IC-NEXT: [[TMP69:%.*]] = extractelement <4 x i1> [[TMP11]], i32 3 -; UNROLL-NO-IC-NEXT: br i1 [[TMP69]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31]] -; UNROLL-NO-IC: pred.store.if30: +; UNROLL-NO-IC-NEXT: br i1 [[TMP69]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]] +; UNROLL-NO-IC: pred.store.if29: ; UNROLL-NO-IC-NEXT: [[TMP70:%.*]] = add i32 [[INDEX]], 7 ; UNROLL-NO-IC-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP70]] ; UNROLL-NO-IC-NEXT: store i32 [[TMP9]], ptr [[TMP71]], align 4 -; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE31]] -; UNROLL-NO-IC: pred.store.continue31: +; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE30]] +; UNROLL-NO-IC: pred.store.continue30: ; UNROLL-NO-IC-NEXT: [[TMP72:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]] -; UNROLL-NO-IC-NEXT: [[TMP73:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI3]] +; UNROLL-NO-IC-NEXT: [[TMP73:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI2]] ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], ; UNROLL-NO-IC-NEXT: [[TMP74:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll index bf1905bf33487..9fbab0abbfd3b 100644 --- a/llvm/test/Transforms/LoopVectorize/float-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll @@ -87,15 +87,15 @@ define void @fp_iv_loop1_fast_FMF(float %init, ptr noalias nocapture %A, i32 %N) ; VEC4_INTERL2-NEXT: 
[[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]] ; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP1]] +; VEC4_INTERL2-NEXT: [[FPINC_INS:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i64 0 +; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[FPINC_INS]], [[TMP3]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[DOTSPLAT3]], ; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP2]] -; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[FPINC]], 4.000000e+00 -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 -; VEC4_INTERL2-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT4]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL2: vector.body: ; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -334,15 +334,15 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32 ; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = fmul reassoc float [[FPINC]], [[DOTCAST]] ; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP1]] +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float 
[[FPINC]], i64 0 +; VEC4_INTERL2-NEXT: [[MUL:%.*]] = fmul reassoc <4 x float> [[DOTSPLATINSERT2]], [[MUL]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = fmul reassoc <4 x float> [[DOTSPLAT3]], ; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fsub reassoc <4 x float> [[DOTSPLAT]], [[TMP2]] -; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul reassoc float [[FPINC]], 4.000000e+00 -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 -; VEC4_INTERL2-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT4]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL2: vector.body: ; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -841,29 +841,27 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC4_INTERL2-NEXT: [[DOTCAST2:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]] ; VEC4_INTERL2-NEXT: [[IND_END3:%.*]] = fadd fast float [[INIT:%.*]], [[TMP3]] +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 +; VEC4_INTERL2-NEXT: [[BROADCAST:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer +; VEC4_INTERL2-NEXT: [[DOTSPLAT5:%.*]] = fmul fast <4 x float> [[BROADCAST]], poison, float [[INIT]], i64 0 ; VEC4_INTERL2-NEXT: 
[[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT6:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT7:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT6]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[DOTSPLAT7]], ; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], [[TMP4]] -; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP0]], 4.000000e+00 -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 -; VEC4_INTERL2-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT8]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 -; VEC4_INTERL2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL2: vector.body: ; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[VEC_IND10:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT13:%.*]], [[VECTOR_BODY]] ] -; VEC4_INTERL2-NEXT: [[STEP_ADD11:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[DOTSPLAT9]] +; VEC4_INTERL2-NEXT: [[STEP_ADD11:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[DOTSPLAT5]] ; VEC4_INTERL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] ; VEC4_INTERL2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 16 ; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND10]], ptr [[TMP6]], align 4 ; VEC4_INTERL2-NEXT: store <4 x float> 
[[STEP_ADD11]], ptr [[TMP7]], align 4 -; VEC4_INTERL2-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[BROADCAST_SPLAT]] -; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[STEP_ADD11]], [[BROADCAST_SPLAT]] +; VEC4_INTERL2-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[BROADCAST]] +; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[STEP_ADD11]], [[BROADCAST]] ; VEC4_INTERL2-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[VEC_IND]], ; VEC4_INTERL2-NEXT: [[TMP11:%.*]] = fadd fast <4 x float> [[VEC_IND]], ; VEC4_INTERL2-NEXT: [[TMP12:%.*]] = fadd fast <4 x float> [[TMP10]], [[TMP8]] @@ -878,7 +876,7 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC4_INTERL2-NEXT: store <4 x float> [[TMP11]], ptr [[TMP17]], align 4 ; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], -; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT13]] = fadd fast <4 x float> [[STEP_ADD11]], [[DOTSPLAT9]] +; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT13]] = fadd fast <4 x float> [[STEP_ADD11]], [[DOTSPLAT5]] ; VEC4_INTERL2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VEC4_INTERL2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; VEC4_INTERL2: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index 2c827e232b722..59ef4b8b162f7 100644 --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -1174,14 +1174,14 @@ define float @scalarize_induction_variable_02(ptr %a, ptr %b, i64 %n) { ; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[OFFSET_IDX]] ; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]] ; INTERLEAVE-NEXT: [[WIDE_VEC:%.*]] = load <32 x float>, ptr [[TMP6]], align 4 -; INTERLEAVE-NEXT: 
[[WIDE_VEC2:%.*]] = load <32 x float>, ptr [[TMP7]], align 4 ; INTERLEAVE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x float> [[WIDE_VEC]], <32 x float> poison, <4 x i32> +; INTERLEAVE-NEXT: [[WIDE_VEC2:%.*]] = load <32 x float>, ptr [[TMP7]], align 4 ; INTERLEAVE-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <32 x float> [[WIDE_VEC2]], <32 x float> poison, <4 x i32> ; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[OFFSET_IDX]] ; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]] ; INTERLEAVE-NEXT: [[WIDE_VEC4:%.*]] = load <32 x float>, ptr [[TMP8]], align 4 -; INTERLEAVE-NEXT: [[WIDE_VEC5:%.*]] = load <32 x float>, ptr [[TMP9]], align 4 ; INTERLEAVE-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <32 x float> [[WIDE_VEC4]], <32 x float> poison, <4 x i32> +; INTERLEAVE-NEXT: [[WIDE_VEC5:%.*]] = load <32 x float>, ptr [[TMP9]], align 4 ; INTERLEAVE-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <32 x float> [[WIDE_VEC5]], <32 x float> poison, <4 x i32> ; INTERLEAVE-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[VEC_PHI]], ; INTERLEAVE-NEXT: [[TMP11:%.*]] = fadd fast <4 x float> [[VEC_PHI1]], @@ -1487,8 +1487,8 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) { ; INTERLEAVE-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP7]], i32 1 ; INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP8]], i32 1 ; INTERLEAVE-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP9]], align 8 -; INTERLEAVE-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP13]], align 8 ; INTERLEAVE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> +; INTERLEAVE-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP13]], align 8 ; INTERLEAVE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> ; INTERLEAVE-NEXT: [[TMP17:%.*]] = xor <4 x i32> [[STRIDED_VEC]], 
[[BROADCAST_SPLAT]] ; INTERLEAVE-NEXT: [[TMP18:%.*]] = xor <4 x i32> [[STRIDED_VEC2]], [[BROADCAST_SPLAT]] @@ -5250,30 +5250,30 @@ define i32 @PR32419(i32 %a, i16 %b) { ; UNROLL-NEXT: [[TMP8:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UREM_IF]] ] ; UNROLL-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP2]], i64 1 ; UNROLL-NEXT: br i1 [[TMP9]], label [[PRED_UREM_IF3:%.*]], label [[PRED_UREM_CONTINUE4:%.*]] -; UNROLL: pred.urem.if3: +; UNROLL: pred.urem.if2: ; UNROLL-NEXT: [[TMP10:%.*]] = add i16 [[TMP1]], -19 ; UNROLL-NEXT: [[TMP11:%.*]] = urem i16 [[B]], [[TMP10]] ; UNROLL-NEXT: [[TMP12:%.*]] = insertelement <2 x i16> [[TMP8]], i16 [[TMP11]], i64 1 ; UNROLL-NEXT: br label [[PRED_UREM_CONTINUE4]] -; UNROLL: pred.urem.continue4: +; UNROLL: pred.urem.continue3: ; UNROLL-NEXT: [[TMP13:%.*]] = phi <2 x i16> [ [[TMP8]], [[PRED_UREM_CONTINUE]] ], [ [[TMP12]], [[PRED_UREM_IF3]] ] ; UNROLL-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP3]], i64 0 ; UNROLL-NEXT: br i1 [[TMP14]], label [[PRED_UREM_IF5:%.*]], label [[PRED_UREM_CONTINUE6:%.*]] -; UNROLL: pred.urem.if5: +; UNROLL: pred.urem.if4: ; UNROLL-NEXT: [[TMP15:%.*]] = add i16 [[TMP1]], -18 ; UNROLL-NEXT: [[TMP16:%.*]] = urem i16 [[B]], [[TMP15]] ; UNROLL-NEXT: [[TMP17:%.*]] = insertelement <2 x i16> poison, i16 [[TMP16]], i64 0 ; UNROLL-NEXT: br label [[PRED_UREM_CONTINUE6]] -; UNROLL: pred.urem.continue6: +; UNROLL: pred.urem.continue5: ; UNROLL-NEXT: [[TMP18:%.*]] = phi <2 x i16> [ poison, [[PRED_UREM_CONTINUE4]] ], [ [[TMP17]], [[PRED_UREM_IF5]] ] ; UNROLL-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP3]], i64 1 ; UNROLL-NEXT: br i1 [[TMP19]], label [[PRED_UREM_IF7:%.*]], label [[PRED_UREM_CONTINUE8]] -; UNROLL: pred.urem.if7: +; UNROLL: pred.urem.if6: ; UNROLL-NEXT: [[TMP20:%.*]] = add i16 [[TMP1]], -17 ; UNROLL-NEXT: [[TMP21:%.*]] = urem i16 [[B]], [[TMP20]] ; UNROLL-NEXT: [[TMP22:%.*]] = insertelement <2 x i16> [[TMP18]], i16 [[TMP21]], i64 1 ; UNROLL-NEXT: br label 
[[PRED_UREM_CONTINUE8]] -; UNROLL: pred.urem.continue8: +; UNROLL: pred.urem.continue7: ; UNROLL-NEXT: [[TMP23:%.*]] = phi <2 x i16> [ [[TMP18]], [[PRED_UREM_CONTINUE6]] ], [ [[TMP22]], [[PRED_UREM_IF7]] ] ; UNROLL-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i16> [[TMP13]], <2 x i16> zeroinitializer ; UNROLL-NEXT: [[PREDPHI9:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> [[TMP23]], <2 x i16> zeroinitializer @@ -5330,30 +5330,30 @@ define i32 @PR32419(i32 %a, i16 %b) { ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_UREM_IF]] ] ; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 ; UNROLL-NO-IC-NEXT: br i1 [[TMP11]], label [[PRED_UREM_IF3:%.*]], label [[PRED_UREM_CONTINUE4:%.*]] -; UNROLL-NO-IC: pred.urem.if3: +; UNROLL-NO-IC: pred.urem.if2: ; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = add i16 [[TMP1]], 1 ; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = urem i16 [[B]], [[TMP12]] ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = insertelement <2 x i16> [[TMP10]], i16 [[TMP13]], i32 1 ; UNROLL-NO-IC-NEXT: br label [[PRED_UREM_CONTINUE4]] -; UNROLL-NO-IC: pred.urem.continue4: +; UNROLL-NO-IC: pred.urem.continue3: ; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = phi <2 x i16> [ [[TMP10]], [[PRED_UREM_CONTINUE]] ], [ [[TMP14]], [[PRED_UREM_IF3]] ] ; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 ; UNROLL-NO-IC-NEXT: br i1 [[TMP16]], label [[PRED_UREM_IF5:%.*]], label [[PRED_UREM_CONTINUE6:%.*]] -; UNROLL-NO-IC: pred.urem.if5: +; UNROLL-NO-IC: pred.urem.if4: ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = add i16 [[TMP1]], 2 ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = urem i16 [[B]], [[TMP17]] ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = insertelement <2 x i16> poison, i16 [[TMP18]], i32 0 ; UNROLL-NO-IC-NEXT: br label [[PRED_UREM_CONTINUE6]] -; UNROLL-NO-IC: pred.urem.continue6: +; UNROLL-NO-IC: pred.urem.continue5: ; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = phi <2 x i16> [ poison, [[PRED_UREM_CONTINUE4]] ], [ [[TMP19]], 
[[PRED_UREM_IF5]] ] ; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 ; UNROLL-NO-IC-NEXT: br i1 [[TMP21]], label [[PRED_UREM_IF7:%.*]], label [[PRED_UREM_CONTINUE8]] -; UNROLL-NO-IC: pred.urem.if7: +; UNROLL-NO-IC: pred.urem.if6: ; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = add i16 [[TMP1]], 3 ; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = urem i16 [[B]], [[TMP22]] ; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = insertelement <2 x i16> [[TMP20]], i16 [[TMP23]], i32 1 ; UNROLL-NO-IC-NEXT: br label [[PRED_UREM_CONTINUE8]] -; UNROLL-NO-IC: pred.urem.continue8: +; UNROLL-NO-IC: pred.urem.continue7: ; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = phi <2 x i16> [ [[TMP20]], [[PRED_UREM_CONTINUE6]] ], [ [[TMP24]], [[PRED_UREM_IF7]] ] ; UNROLL-NO-IC-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i16> zeroinitializer, <2 x i16> [[TMP15]] ; UNROLL-NO-IC-NEXT: [[PREDPHI9:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> zeroinitializer, <2 x i16> [[TMP25]] @@ -5418,66 +5418,66 @@ define i32 @PR32419(i32 %a, i16 %b) { ; INTERLEAVE-NEXT: [[TMP8:%.*]] = phi <4 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UREM_IF]] ] ; INTERLEAVE-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1 ; INTERLEAVE-NEXT: br i1 [[TMP9]], label [[PRED_UREM_IF3:%.*]], label [[PRED_UREM_CONTINUE4:%.*]] -; INTERLEAVE: pred.urem.if3: +; INTERLEAVE: pred.urem.if2: ; INTERLEAVE-NEXT: [[TMP10:%.*]] = add i16 [[TMP1]], -19 ; INTERLEAVE-NEXT: [[TMP11:%.*]] = urem i16 [[B]], [[TMP10]] ; INTERLEAVE-NEXT: [[TMP12:%.*]] = insertelement <4 x i16> [[TMP8]], i16 [[TMP11]], i64 1 ; INTERLEAVE-NEXT: br label [[PRED_UREM_CONTINUE4]] -; INTERLEAVE: pred.urem.continue4: +; INTERLEAVE: pred.urem.continue3: ; INTERLEAVE-NEXT: [[TMP13:%.*]] = phi <4 x i16> [ [[TMP8]], [[PRED_UREM_CONTINUE]] ], [ [[TMP12]], [[PRED_UREM_IF3]] ] ; INTERLEAVE-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP2]], i64 2 ; INTERLEAVE-NEXT: br i1 [[TMP14]], label [[PRED_UREM_IF5:%.*]], label [[PRED_UREM_CONTINUE6:%.*]] -; INTERLEAVE: 
pred.urem.if5: +; INTERLEAVE: pred.urem.if4: ; INTERLEAVE-NEXT: [[TMP15:%.*]] = add i16 [[TMP1]], -18 ; INTERLEAVE-NEXT: [[TMP16:%.*]] = urem i16 [[B]], [[TMP15]] ; INTERLEAVE-NEXT: [[TMP17:%.*]] = insertelement <4 x i16> [[TMP13]], i16 [[TMP16]], i64 2 ; INTERLEAVE-NEXT: br label [[PRED_UREM_CONTINUE6]] -; INTERLEAVE: pred.urem.continue6: +; INTERLEAVE: pred.urem.continue5: ; INTERLEAVE-NEXT: [[TMP18:%.*]] = phi <4 x i16> [ [[TMP13]], [[PRED_UREM_CONTINUE4]] ], [ [[TMP17]], [[PRED_UREM_IF5]] ] ; INTERLEAVE-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3 ; INTERLEAVE-NEXT: br i1 [[TMP19]], label [[PRED_UREM_IF7:%.*]], label [[PRED_UREM_CONTINUE8:%.*]] -; INTERLEAVE: pred.urem.if7: +; INTERLEAVE: pred.urem.if6: ; INTERLEAVE-NEXT: [[TMP20:%.*]] = add i16 [[TMP1]], -17 ; INTERLEAVE-NEXT: [[TMP21:%.*]] = urem i16 [[B]], [[TMP20]] ; INTERLEAVE-NEXT: [[TMP22:%.*]] = insertelement <4 x i16> [[TMP18]], i16 [[TMP21]], i64 3 ; INTERLEAVE-NEXT: br label [[PRED_UREM_CONTINUE8]] -; INTERLEAVE: pred.urem.continue8: +; INTERLEAVE: pred.urem.continue7: ; INTERLEAVE-NEXT: [[TMP23:%.*]] = phi <4 x i16> [ [[TMP18]], [[PRED_UREM_CONTINUE6]] ], [ [[TMP22]], [[PRED_UREM_IF7]] ] ; INTERLEAVE-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0 ; INTERLEAVE-NEXT: br i1 [[TMP24]], label [[PRED_UREM_IF9:%.*]], label [[PRED_UREM_CONTINUE10:%.*]] -; INTERLEAVE: pred.urem.if9: +; INTERLEAVE: pred.urem.if8: ; INTERLEAVE-NEXT: [[TMP25:%.*]] = add i16 [[TMP1]], -16 ; INTERLEAVE-NEXT: [[TMP26:%.*]] = urem i16 [[B]], [[TMP25]] ; INTERLEAVE-NEXT: [[TMP27:%.*]] = insertelement <4 x i16> poison, i16 [[TMP26]], i64 0 ; INTERLEAVE-NEXT: br label [[PRED_UREM_CONTINUE10]] -; INTERLEAVE: pred.urem.continue10: +; INTERLEAVE: pred.urem.continue9: ; INTERLEAVE-NEXT: [[TMP28:%.*]] = phi <4 x i16> [ poison, [[PRED_UREM_CONTINUE8]] ], [ [[TMP27]], [[PRED_UREM_IF9]] ] ; INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1 ; INTERLEAVE-NEXT: br i1 [[TMP29]], label 
[[PRED_UREM_IF11:%.*]], label [[PRED_UREM_CONTINUE12:%.*]] -; INTERLEAVE: pred.urem.if11: +; INTERLEAVE: pred.urem.if10: ; INTERLEAVE-NEXT: [[TMP30:%.*]] = add i16 [[TMP1]], -15 ; INTERLEAVE-NEXT: [[TMP31:%.*]] = urem i16 [[B]], [[TMP30]] ; INTERLEAVE-NEXT: [[TMP32:%.*]] = insertelement <4 x i16> [[TMP28]], i16 [[TMP31]], i64 1 ; INTERLEAVE-NEXT: br label [[PRED_UREM_CONTINUE12]] -; INTERLEAVE: pred.urem.continue12: +; INTERLEAVE: pred.urem.continue11: ; INTERLEAVE-NEXT: [[TMP33:%.*]] = phi <4 x i16> [ [[TMP28]], [[PRED_UREM_CONTINUE10]] ], [ [[TMP32]], [[PRED_UREM_IF11]] ] ; INTERLEAVE-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2 ; INTERLEAVE-NEXT: br i1 [[TMP34]], label [[PRED_UREM_IF13:%.*]], label [[PRED_UREM_CONTINUE14:%.*]] -; INTERLEAVE: pred.urem.if13: +; INTERLEAVE: pred.urem.if12: ; INTERLEAVE-NEXT: [[TMP35:%.*]] = add i16 [[TMP1]], -14 ; INTERLEAVE-NEXT: [[TMP36:%.*]] = urem i16 [[B]], [[TMP35]] ; INTERLEAVE-NEXT: [[TMP37:%.*]] = insertelement <4 x i16> [[TMP33]], i16 [[TMP36]], i64 2 ; INTERLEAVE-NEXT: br label [[PRED_UREM_CONTINUE14]] -; INTERLEAVE: pred.urem.continue14: +; INTERLEAVE: pred.urem.continue13: ; INTERLEAVE-NEXT: [[TMP38:%.*]] = phi <4 x i16> [ [[TMP33]], [[PRED_UREM_CONTINUE12]] ], [ [[TMP37]], [[PRED_UREM_IF13]] ] ; INTERLEAVE-NEXT: [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3 ; INTERLEAVE-NEXT: br i1 [[TMP39]], label [[PRED_UREM_IF15:%.*]], label [[PRED_UREM_CONTINUE16]] -; INTERLEAVE: pred.urem.if15: +; INTERLEAVE: pred.urem.if14: ; INTERLEAVE-NEXT: [[TMP40:%.*]] = add i16 [[TMP1]], -13 ; INTERLEAVE-NEXT: [[TMP41:%.*]] = urem i16 [[B]], [[TMP40]] ; INTERLEAVE-NEXT: [[TMP42:%.*]] = insertelement <4 x i16> [[TMP38]], i16 [[TMP41]], i64 3 ; INTERLEAVE-NEXT: br label [[PRED_UREM_CONTINUE16]] -; INTERLEAVE: pred.urem.continue16: +; INTERLEAVE: pred.urem.continue15: ; INTERLEAVE-NEXT: [[TMP43:%.*]] = phi <4 x i16> [ [[TMP38]], [[PRED_UREM_CONTINUE14]] ], [ [[TMP42]], [[PRED_UREM_IF15]] ] ; INTERLEAVE-NEXT: 
[[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i16> [[TMP23]], <4 x i16> zeroinitializer ; INTERLEAVE-NEXT: [[PREDPHI17:%.*]] = select <4 x i1> [[TMP3]], <4 x i16> [[TMP43]], <4 x i16> zeroinitializer @@ -6379,12 +6379,12 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; UNROLL-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4 ; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 ; UNROLL-NEXT: [[IND_END:%.*]] = mul i32 [[STEP]], [[DOTCAST]] +; UNROLL-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0 +; UNROLL-NEXT: [[TMP16:%.*]] = shl <2 x i32> [[DOTSPLATINSERT2]], +; UNROLL-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0 ; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NEXT: [[TMP15:%.*]] = mul nuw <2 x i32> [[DOTSPLAT]], -; UNROLL-NEXT: [[TMP16:%.*]] = shl i32 [[STEP]], 1 -; UNROLL-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP16]], i64 0 -; UNROLL-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT2]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: ; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -6457,13 +6457,13 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; UNROLL-NO-IC-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 ; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]] +; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0 +; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT2]], <2 x i32> poison, <2 x i32> 
zeroinitializer +; UNROLL-NO-IC-NEXT: [[DOTSPLAT3:%.*]] = mul <2 x i32> , [[TMP18]] ; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0 ; UNROLL-NO-IC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = mul <2 x i32> , [[DOTSPLAT]] ; UNROLL-NO-IC-NEXT: [[INDUCTION:%.*]] = add <2 x i32> zeroinitializer, [[TMP17]] -; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = mul i32 [[STEP]], 2 -; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i64 0 -; UNROLL-NO-IC-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT2]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -6537,12 +6537,12 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -8 ; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 ; INTERLEAVE-NEXT: [[IND_END:%.*]] = mul i32 [[STEP]], [[DOTCAST]] +; INTERLEAVE-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i64 0 +; INTERLEAVE-NEXT: [[TMP16:%.*]] = shl <4 x i32> [[DOTSPLATINSERT2]], +; INTERLEAVE-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[TMP16]], <4 x i32> poison, <4 x i32> zeroinitializer ; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i64 0 ; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; INTERLEAVE-NEXT: [[TMP15:%.*]] = mul <4 x i32> [[DOTSPLAT]], -; INTERLEAVE-NEXT: [[TMP16:%.*]] = shl i32 [[STEP]], 2 -; INTERLEAVE-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP16]], i64 0 -; INTERLEAVE-NEXT: [[DOTSPLAT3:%.*]] = 
shufflevector <4 x i32> [[DOTSPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll index 2503520c0ff9d..d80c5aed3ea2d 100644 --- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll +++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll @@ -230,7 +230,6 @@ define void @first_order_recurrence_using_induction(i32 %n, ptr %dst) { ; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[INDEX]] to i32 ; CHECK-NEXT: [[INDUCTION:%.*]] = add i32 [[TMP3]], 0 ; CHECK-NEXT: [[INDUCTION1]] = add i32 [[TMP3]], 1 -; CHECK-NEXT: store i32 [[VECTOR_RECUR]], ptr [[DST:%.*]], align 4 ; CHECK-NEXT: store i32 [[INDUCTION]], ptr [[DST]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], %n.vec @@ -304,7 +303,6 @@ define void @scalarize_ptrtoint(ptr %src, ptr %dst) { ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP7]], 10 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP9]] to ptr -; CHECK-NEXT: store ptr [[TMP10]], ptr %dst, align 8 ; CHECK-NEXT: store ptr [[TMP11]], ptr %dst, align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 diff --git a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll index 7c23b603b6e91..dc3480fbb11a8 100644 --- a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll +++ b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll @@ -92,28 +92,28 @@ define void @pr45679(ptr %A) optsize { ; VF2UF2: 
pred.store.continue: ; VF2UF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1 ; VF2UF2-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]] -; VF2UF2: pred.store.if2: +; VF2UF2: pred.store.if1: ; VF2UF2-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 1 ; VF2UF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP6]] ; VF2UF2-NEXT: store i32 13, ptr [[TMP7]], align 1 ; VF2UF2-NEXT: br label [[PRED_STORE_CONTINUE3]] -; VF2UF2: pred.store.continue3: +; VF2UF2: pred.store.continue2: ; VF2UF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 ; VF2UF2-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]] -; VF2UF2: pred.store.if4: +; VF2UF2: pred.store.if3: ; VF2UF2-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 2 ; VF2UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP9]] ; VF2UF2-NEXT: store i32 13, ptr [[TMP10]], align 1 ; VF2UF2-NEXT: br label [[PRED_STORE_CONTINUE5]] -; VF2UF2: pred.store.continue5: +; VF2UF2: pred.store.continue4: ; VF2UF2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 ; VF2UF2-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]] -; VF2UF2: pred.store.if6: +; VF2UF2: pred.store.if5: ; VF2UF2-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 3 ; VF2UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP12]] ; VF2UF2-NEXT: store i32 13, ptr [[TMP13]], align 1 ; VF2UF2-NEXT: br label [[PRED_STORE_CONTINUE7]] -; VF2UF2: pred.store.continue7: +; VF2UF2: pred.store.continue6: ; VF2UF2-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; VF2UF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], ; VF2UF2-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 @@ -293,31 +293,31 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) { ; VF2UF2: pred.store.continue: ; VF2UF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1 ; VF2UF2-NEXT: br i1 [[TMP7]], label 
[[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]] -; VF2UF2: pred.store.if2: +; VF2UF2: pred.store.if1: ; VF2UF2-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 1 ; VF2UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] ; VF2UF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF2UF2-NEXT: store i64 [[TMP10]], ptr [[B]], align 8 ; VF2UF2-NEXT: br label [[PRED_STORE_CONTINUE3]] -; VF2UF2: pred.store.continue3: +; VF2UF2: pred.store.continue2: ; VF2UF2-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 ; VF2UF2-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]] -; VF2UF2: pred.store.if4: +; VF2UF2: pred.store.if3: ; VF2UF2-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 2 ; VF2UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] ; VF2UF2-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF2UF2-NEXT: store i64 [[TMP15]], ptr [[B]], align 8 ; VF2UF2-NEXT: br label [[PRED_STORE_CONTINUE5]] -; VF2UF2: pred.store.continue5: +; VF2UF2: pred.store.continue4: ; VF2UF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 ; VF2UF2-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]] -; VF2UF2: pred.store.if6: +; VF2UF2: pred.store.if5: ; VF2UF2-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 3 ; VF2UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP18]] ; VF2UF2-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP19]], align 8 ; VF2UF2-NEXT: store i64 [[TMP20]], ptr [[B]], align 8 ; VF2UF2-NEXT: br label [[PRED_STORE_CONTINUE7]] -; VF2UF2: pred.store.continue7: +; VF2UF2: pred.store.continue6: ; VF2UF2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; VF2UF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], ; VF2UF2-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll index 
119571d1c39cc..99a0b70803588 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll @@ -97,153 +97,153 @@ define i32 @predicated(ptr noalias nocapture %A) { ; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP0]], i64 1 ; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] -; CHECK: pred.load.if7: +; CHECK: pred.load.if4: ; CHECK-NEXT: [[TMP10:%.*]] = or disjoint i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP12]], i64 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.continue8: +; CHECK: pred.load.continue5: ; CHECK-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF7]] ] ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP0]], i64 2 ; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] -; CHECK: pred.load.if9: +; CHECK: pred.load.if6: ; CHECK-NEXT: [[TMP16:%.*]] = or disjoint i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]] ; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 ; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP18]], i64 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] -; CHECK: pred.load.continue10: +; CHECK: pred.load.continue7: ; CHECK-NEXT: [[TMP20:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP19]], [[PRED_LOAD_IF9]] ] ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP0]], i64 3 ; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] -; CHECK: pred.load.if11: +; 
CHECK: pred.load.if8: ; CHECK-NEXT: [[TMP22:%.*]] = or disjoint i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]] ; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4 ; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP24]], i64 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] -; CHECK: pred.load.continue12: +; CHECK: pred.load.continue9: ; CHECK-NEXT: [[TMP26:%.*]] = phi <4 x i32> [ [[TMP20]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP25]], [[PRED_LOAD_IF11]] ] ; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[TMP1]], i64 0 ; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] -; CHECK: pred.load.if13: +; CHECK: pred.load.if10: ; CHECK-NEXT: [[TMP28:%.*]] = or disjoint i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP28]] ; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP29]], align 4 ; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> poison, i32 [[TMP30]], i64 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.continue14: +; CHECK: pred.load.continue11: ; CHECK-NEXT: [[TMP32:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE12]] ], [ [[TMP31]], [[PRED_LOAD_IF13]] ] ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i1> [[TMP1]], i64 1 ; CHECK-NEXT: br i1 [[TMP33]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] -; CHECK: pred.load.if15: +; CHECK: pred.load.if12: ; CHECK-NEXT: [[TMP34:%.*]] = or disjoint i64 [[INDEX]], 5 ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP34]] ; CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4 ; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP36]], i64 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE16]] -; CHECK: pred.load.continue16: +; CHECK: pred.load.continue13: ; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP32]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP37]], 
[[PRED_LOAD_IF15]] ] ; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i1> [[TMP1]], i64 2 ; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] -; CHECK: pred.load.if17: +; CHECK: pred.load.if14: ; CHECK-NEXT: [[TMP40:%.*]] = or disjoint i64 [[INDEX]], 6 ; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP40]] ; CHECK-NEXT: [[TMP42:%.*]] = load i32, ptr [[TMP41]], align 4 ; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP42]], i64 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE18]] -; CHECK: pred.load.continue18: +; CHECK: pred.load.continue15: ; CHECK-NEXT: [[TMP44:%.*]] = phi <4 x i32> [ [[TMP38]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP43]], [[PRED_LOAD_IF17]] ] ; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i1> [[TMP1]], i64 3 ; CHECK-NEXT: br i1 [[TMP45]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] -; CHECK: pred.load.if19: +; CHECK: pred.load.if16: ; CHECK-NEXT: [[TMP46:%.*]] = or disjoint i64 [[INDEX]], 7 ; CHECK-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP46]] ; CHECK-NEXT: [[TMP48:%.*]] = load i32, ptr [[TMP47]], align 4 ; CHECK-NEXT: [[TMP49:%.*]] = insertelement <4 x i32> [[TMP44]], i32 [[TMP48]], i64 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE20]] -; CHECK: pred.load.continue20: +; CHECK: pred.load.continue17: ; CHECK-NEXT: [[TMP50:%.*]] = phi <4 x i32> [ [[TMP44]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP49]], [[PRED_LOAD_IF19]] ] ; CHECK-NEXT: [[TMP51:%.*]] = extractelement <4 x i1> [[TMP2]], i64 0 ; CHECK-NEXT: br i1 [[TMP51]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] -; CHECK: pred.load.if21: +; CHECK: pred.load.if18: ; CHECK-NEXT: [[TMP52:%.*]] = or disjoint i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP52]] ; CHECK-NEXT: [[TMP54:%.*]] = load i32, ptr [[TMP53]], align 4 ; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i32> poison, i32 
[[TMP54]], i64 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE22]] -; CHECK: pred.load.continue22: +; CHECK: pred.load.continue19: ; CHECK-NEXT: [[TMP56:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE20]] ], [ [[TMP55]], [[PRED_LOAD_IF21]] ] ; CHECK-NEXT: [[TMP57:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1 ; CHECK-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] -; CHECK: pred.load.if23: +; CHECK: pred.load.if20: ; CHECK-NEXT: [[TMP58:%.*]] = or disjoint i64 [[INDEX]], 9 ; CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP58]] ; CHECK-NEXT: [[TMP60:%.*]] = load i32, ptr [[TMP59]], align 4 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i32> [[TMP56]], i32 [[TMP60]], i64 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE24]] -; CHECK: pred.load.continue24: +; CHECK: pred.load.continue21: ; CHECK-NEXT: [[TMP62:%.*]] = phi <4 x i32> [ [[TMP56]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP61]], [[PRED_LOAD_IF23]] ] ; CHECK-NEXT: [[TMP63:%.*]] = extractelement <4 x i1> [[TMP2]], i64 2 ; CHECK-NEXT: br i1 [[TMP63]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] -; CHECK: pred.load.if25: +; CHECK: pred.load.if22: ; CHECK-NEXT: [[TMP64:%.*]] = or disjoint i64 [[INDEX]], 10 ; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP64]] ; CHECK-NEXT: [[TMP66:%.*]] = load i32, ptr [[TMP65]], align 4 ; CHECK-NEXT: [[TMP67:%.*]] = insertelement <4 x i32> [[TMP62]], i32 [[TMP66]], i64 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE26]] -; CHECK: pred.load.continue26: +; CHECK: pred.load.continue23: ; CHECK-NEXT: [[TMP68:%.*]] = phi <4 x i32> [ [[TMP62]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP67]], [[PRED_LOAD_IF25]] ] ; CHECK-NEXT: [[TMP69:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3 ; CHECK-NEXT: br i1 [[TMP69]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] -; CHECK: pred.load.if27: +; CHECK: pred.load.if24: ; CHECK-NEXT: [[TMP70:%.*]] = or disjoint i64 
[[INDEX]], 11 ; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP70]] ; CHECK-NEXT: [[TMP72:%.*]] = load i32, ptr [[TMP71]], align 4 ; CHECK-NEXT: [[TMP73:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP72]], i64 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE28]] -; CHECK: pred.load.continue28: +; CHECK: pred.load.continue25: ; CHECK-NEXT: [[TMP74:%.*]] = phi <4 x i32> [ [[TMP68]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP73]], [[PRED_LOAD_IF27]] ] ; CHECK-NEXT: [[TMP75:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0 ; CHECK-NEXT: br i1 [[TMP75]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] -; CHECK: pred.load.if29: +; CHECK: pred.load.if26: ; CHECK-NEXT: [[TMP76:%.*]] = or disjoint i64 [[INDEX]], 12 ; CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP76]] ; CHECK-NEXT: [[TMP78:%.*]] = load i32, ptr [[TMP77]], align 4 ; CHECK-NEXT: [[TMP79:%.*]] = insertelement <4 x i32> poison, i32 [[TMP78]], i64 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE30]] -; CHECK: pred.load.continue30: +; CHECK: pred.load.continue27: ; CHECK-NEXT: [[TMP80:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE28]] ], [ [[TMP79]], [[PRED_LOAD_IF29]] ] ; CHECK-NEXT: [[TMP81:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1 ; CHECK-NEXT: br i1 [[TMP81]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] -; CHECK: pred.load.if31: +; CHECK: pred.load.if28: ; CHECK-NEXT: [[TMP82:%.*]] = or disjoint i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP83:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP82]] ; CHECK-NEXT: [[TMP84:%.*]] = load i32, ptr [[TMP83]], align 4 ; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP80]], i32 [[TMP84]], i64 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE32]] -; CHECK: pred.load.continue32: +; CHECK: pred.load.continue29: ; CHECK-NEXT: [[TMP86:%.*]] = phi <4 x i32> [ [[TMP80]], [[PRED_LOAD_CONTINUE30]] ], [ [[TMP85]], [[PRED_LOAD_IF31]] ] ; CHECK-NEXT: [[TMP87:%.*]] = extractelement 
<4 x i1> [[TMP3]], i64 2 ; CHECK-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] -; CHECK: pred.load.if33: +; CHECK: pred.load.if30: ; CHECK-NEXT: [[TMP88:%.*]] = or disjoint i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP89:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP88]] ; CHECK-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP89]], align 4 ; CHECK-NEXT: [[TMP91:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP90]], i64 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE34]] -; CHECK: pred.load.continue34: +; CHECK: pred.load.continue31: ; CHECK-NEXT: [[TMP92:%.*]] = phi <4 x i32> [ [[TMP86]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP91]], [[PRED_LOAD_IF33]] ] ; CHECK-NEXT: [[TMP93:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3 ; CHECK-NEXT: br i1 [[TMP93]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36]] -; CHECK: pred.load.if35: +; CHECK: pred.load.if32: ; CHECK-NEXT: [[TMP94:%.*]] = or disjoint i64 [[INDEX]], 15 ; CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP94]] ; CHECK-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP95]], align 4 ; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP96]], i64 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE36]] -; CHECK: pred.load.continue36: +; CHECK: pred.load.continue33: ; CHECK-NEXT: [[TMP98:%.*]] = phi <4 x i32> [ [[TMP92]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP97]], [[PRED_LOAD_IF35]] ] ; CHECK-NEXT: [[TMP99:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP26]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP100:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP99]]) @@ -302,11 +302,8 @@ define i32 @cond_rdx_pred(i32 %cond, ptr noalias %a, i64 %N) { ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], -16 ; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i64 [[N]], -1 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32> poison, i32 [[COND:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i32> 
[[BROADCAST_SPLATINSERT7]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLAT8]], -; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLAT8]], -; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLAT8]], -; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLAT8]], +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLATINSERT7]], +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i1> [[TMP4]], <4 x i1> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -324,10 +321,10 @@ define i32 @cond_rdx_pred(i32 %cond, ptr noalias %a, i64 %N) { ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <4 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ule <4 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP0]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP6]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP3]], <4 x i1> [[TMP7]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP0]], <4 x i1> [[BROADCAST_SPLAT8]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[BROADCAST_SPLAT8]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[BROADCAST_SPLAT8]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP3]], <4 x i1> 
[[BROADCAST_SPLAT8]], <4 x i1> zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP8]], i64 0 ; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -339,153 +336,153 @@ define i32 @cond_rdx_pred(i32 %cond, ptr noalias %a, i64 %N) { ; CHECK-NEXT: [[TMP16:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP15]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP8]], i64 1 ; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] -; CHECK: pred.load.if9: +; CHECK: pred.load.if6: ; CHECK-NEXT: [[TMP18:%.*]] = or disjoint i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] ; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 ; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP20]], i64 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] -; CHECK: pred.load.continue10: +; CHECK: pred.load.continue7: ; CHECK-NEXT: [[TMP22:%.*]] = phi <4 x i32> [ [[TMP16]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP21]], [[PRED_LOAD_IF9]] ] ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP8]], i64 2 ; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] -; CHECK: pred.load.if11: +; CHECK: pred.load.if8: ; CHECK-NEXT: [[TMP24:%.*]] = or disjoint i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP24]] ; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 ; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP26]], i64 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] -; CHECK: pred.load.continue12: +; CHECK: pred.load.continue9: ; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ [[TMP22]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP27]], [[PRED_LOAD_IF11]] ] ; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP8]], i64 3 ; CHECK-NEXT: br i1 [[TMP29]], 
label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] -; CHECK: pred.load.if13: +; CHECK: pred.load.if10: ; CHECK-NEXT: [[TMP30:%.*]] = or disjoint i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP30]] ; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 ; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP32]], i64 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.continue14: +; CHECK: pred.load.continue11: ; CHECK-NEXT: [[TMP34:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP33]], [[PRED_LOAD_IF13]] ] ; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[TMP9]], i64 0 ; CHECK-NEXT: br i1 [[TMP35]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] -; CHECK: pred.load.if15: +; CHECK: pred.load.if12: ; CHECK-NEXT: [[TMP36:%.*]] = or disjoint i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP36]] ; CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4 ; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i32> poison, i32 [[TMP38]], i64 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE16]] -; CHECK: pred.load.continue16: +; CHECK: pred.load.continue13: ; CHECK-NEXT: [[TMP40:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE14]] ], [ [[TMP39]], [[PRED_LOAD_IF15]] ] ; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i1> [[TMP9]], i64 1 ; CHECK-NEXT: br i1 [[TMP41]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] -; CHECK: pred.load.if17: +; CHECK: pred.load.if14: ; CHECK-NEXT: [[TMP42:%.*]] = or disjoint i64 [[INDEX]], 5 ; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP42]] ; CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4 ; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i32> [[TMP40]], i32 [[TMP44]], i64 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE18]] -; CHECK: pred.load.continue18: +; CHECK: pred.load.continue15: ; 
CHECK-NEXT: [[TMP46:%.*]] = phi <4 x i32> [ [[TMP40]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP45]], [[PRED_LOAD_IF17]] ] ; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i1> [[TMP9]], i64 2 ; CHECK-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] -; CHECK: pred.load.if19: +; CHECK: pred.load.if16: ; CHECK-NEXT: [[TMP48:%.*]] = or disjoint i64 [[INDEX]], 6 ; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP48]] ; CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr [[TMP49]], align 4 ; CHECK-NEXT: [[TMP51:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[TMP50]], i64 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE20]] -; CHECK: pred.load.continue20: +; CHECK: pred.load.continue17: ; CHECK-NEXT: [[TMP52:%.*]] = phi <4 x i32> [ [[TMP46]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP51]], [[PRED_LOAD_IF19]] ] ; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i1> [[TMP9]], i64 3 ; CHECK-NEXT: br i1 [[TMP53]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] -; CHECK: pred.load.if21: +; CHECK: pred.load.if18: ; CHECK-NEXT: [[TMP54:%.*]] = or disjoint i64 [[INDEX]], 7 ; CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP54]] ; CHECK-NEXT: [[TMP56:%.*]] = load i32, ptr [[TMP55]], align 4 ; CHECK-NEXT: [[TMP57:%.*]] = insertelement <4 x i32> [[TMP52]], i32 [[TMP56]], i64 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE22]] -; CHECK: pred.load.continue22: +; CHECK: pred.load.continue19: ; CHECK-NEXT: [[TMP58:%.*]] = phi <4 x i32> [ [[TMP52]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP57]], [[PRED_LOAD_IF21]] ] ; CHECK-NEXT: [[TMP59:%.*]] = extractelement <4 x i1> [[TMP10]], i64 0 ; CHECK-NEXT: br i1 [[TMP59]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] -; CHECK: pred.load.if23: +; CHECK: pred.load.if20: ; CHECK-NEXT: [[TMP60:%.*]] = or disjoint i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP60]] ; CHECK-NEXT: [[TMP62:%.*]] = load i32, 
ptr [[TMP61]], align 4 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i32> poison, i32 [[TMP62]], i64 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE24]] -; CHECK: pred.load.continue24: +; CHECK: pred.load.continue21: ; CHECK-NEXT: [[TMP64:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE22]] ], [ [[TMP63]], [[PRED_LOAD_IF23]] ] ; CHECK-NEXT: [[TMP65:%.*]] = extractelement <4 x i1> [[TMP10]], i64 1 ; CHECK-NEXT: br i1 [[TMP65]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] -; CHECK: pred.load.if25: +; CHECK: pred.load.if22: ; CHECK-NEXT: [[TMP66:%.*]] = or disjoint i64 [[INDEX]], 9 ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP66]] ; CHECK-NEXT: [[TMP68:%.*]] = load i32, ptr [[TMP67]], align 4 ; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP68]], i64 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE26]] -; CHECK: pred.load.continue26: +; CHECK: pred.load.continue23: ; CHECK-NEXT: [[TMP70:%.*]] = phi <4 x i32> [ [[TMP64]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP69]], [[PRED_LOAD_IF25]] ] ; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i1> [[TMP10]], i64 2 ; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] -; CHECK: pred.load.if27: +; CHECK: pred.load.if24: ; CHECK-NEXT: [[TMP72:%.*]] = or disjoint i64 [[INDEX]], 10 ; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP72]] ; CHECK-NEXT: [[TMP74:%.*]] = load i32, ptr [[TMP73]], align 4 ; CHECK-NEXT: [[TMP75:%.*]] = insertelement <4 x i32> [[TMP70]], i32 [[TMP74]], i64 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE28]] -; CHECK: pred.load.continue28: +; CHECK: pred.load.continue25: ; CHECK-NEXT: [[TMP76:%.*]] = phi <4 x i32> [ [[TMP70]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP75]], [[PRED_LOAD_IF27]] ] ; CHECK-NEXT: [[TMP77:%.*]] = extractelement <4 x i1> [[TMP10]], i64 3 ; CHECK-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] -; CHECK: 
pred.load.if29: +; CHECK: pred.load.if26: ; CHECK-NEXT: [[TMP78:%.*]] = or disjoint i64 [[INDEX]], 11 ; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP78]] ; CHECK-NEXT: [[TMP80:%.*]] = load i32, ptr [[TMP79]], align 4 ; CHECK-NEXT: [[TMP81:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP80]], i64 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE30]] -; CHECK: pred.load.continue30: +; CHECK: pred.load.continue27: ; CHECK-NEXT: [[TMP82:%.*]] = phi <4 x i32> [ [[TMP76]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP81]], [[PRED_LOAD_IF29]] ] ; CHECK-NEXT: [[TMP83:%.*]] = extractelement <4 x i1> [[TMP11]], i64 0 ; CHECK-NEXT: br i1 [[TMP83]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] -; CHECK: pred.load.if31: +; CHECK: pred.load.if28: ; CHECK-NEXT: [[TMP84:%.*]] = or disjoint i64 [[INDEX]], 12 ; CHECK-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP84]] ; CHECK-NEXT: [[TMP86:%.*]] = load i32, ptr [[TMP85]], align 4 ; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> poison, i32 [[TMP86]], i64 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE32]] -; CHECK: pred.load.continue32: +; CHECK: pred.load.continue29: ; CHECK-NEXT: [[TMP88:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP87]], [[PRED_LOAD_IF31]] ] ; CHECK-NEXT: [[TMP89:%.*]] = extractelement <4 x i1> [[TMP11]], i64 1 ; CHECK-NEXT: br i1 [[TMP89]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] -; CHECK: pred.load.if33: +; CHECK: pred.load.if30: ; CHECK-NEXT: [[TMP90:%.*]] = or disjoint i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP91:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP90]] ; CHECK-NEXT: [[TMP92:%.*]] = load i32, ptr [[TMP91]], align 4 ; CHECK-NEXT: [[TMP93:%.*]] = insertelement <4 x i32> [[TMP88]], i32 [[TMP92]], i64 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE34]] -; CHECK: pred.load.continue34: +; CHECK: pred.load.continue31: ; CHECK-NEXT: [[TMP94:%.*]] = phi <4 x i32> [ [[TMP88]], 
[[PRED_LOAD_CONTINUE32]] ], [ [[TMP93]], [[PRED_LOAD_IF33]] ] ; CHECK-NEXT: [[TMP95:%.*]] = extractelement <4 x i1> [[TMP11]], i64 2 ; CHECK-NEXT: br i1 [[TMP95]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]] -; CHECK: pred.load.if35: +; CHECK: pred.load.if32: ; CHECK-NEXT: [[TMP96:%.*]] = or disjoint i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP97:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP96]] ; CHECK-NEXT: [[TMP98:%.*]] = load i32, ptr [[TMP97]], align 4 ; CHECK-NEXT: [[TMP99:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP98]], i64 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE36]] -; CHECK: pred.load.continue36: +; CHECK: pred.load.continue33: ; CHECK-NEXT: [[TMP100:%.*]] = phi <4 x i32> [ [[TMP94]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP99]], [[PRED_LOAD_IF35]] ] ; CHECK-NEXT: [[TMP101:%.*]] = extractelement <4 x i1> [[TMP11]], i64 3 ; CHECK-NEXT: br i1 [[TMP101]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38]] -; CHECK: pred.load.if37: +; CHECK: pred.load.if34: ; CHECK-NEXT: [[TMP102:%.*]] = or disjoint i64 [[INDEX]], 15 ; CHECK-NEXT: [[TMP103:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP102]] ; CHECK-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP103]], align 4 ; CHECK-NEXT: [[TMP105:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP104]], i64 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE38]] -; CHECK: pred.load.continue38: +; CHECK: pred.load.continue35: ; CHECK-NEXT: [[TMP106:%.*]] = phi <4 x i32> [ [[TMP100]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP105]], [[PRED_LOAD_IF37]] ] ; CHECK-NEXT: [[TMP107:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP34]], <4 x i32> ; CHECK-NEXT: [[TMP108:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP107]]) diff --git a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll index 60827160dcb32..a1e28999a4002 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll +++ 
b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll @@ -2,7 +2,7 @@ ; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -debug -disable-output %s 2>&1 | FileCheck %s define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { -; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Final VPlan for VF={2},UF={1}' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll index 431d14be45857..c17e7068278d0 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll @@ -66,13 +66,19 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> -; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> -; CHECK-NEXT: EMIT vp<[[PADD:%.+]]> = ptradd ir<%A>, vp<[[STEPS]]> -; CHECK-NEXT: vp<[[VPTR:%.]]> = vector-pointer vp<[[PADD]]> -; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VPTR]]> +; CHECK-NEXT: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> +; CHECK-NEXT: vp<[[STEPS2:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, ir<1> +; CHECK-NEXT: EMIT vp<[[PADD1:%.+]]> = ptradd ir<%A>, vp<[[STEPS1]]> +; CHECK-NEXT: vp<[[VPTR1:%.]]> = vector-pointer vp<[[PADD1]]> +; CHECK-NEXT: vp<[[VPTR2:%.]]> = vector-pointer vp<[[PADD1]]>, ir<1> +; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VPTR1]]> +; CHECK-NEXT: WIDEN ir<%l>.1 = load vp<[[VPTR2]]> ; CHECK-NEXT: WIDEN ir<%add> = add nsw ir<%l>, ir<10> -; CHECK-NEXT: vp<[[VPTR2:%.+]]> = vector-pointer vp<[[PADD]]> -; CHECK-NEXT: WIDEN store vp<[[VPTR2]]>, ir<%add> +; CHECK-NEXT: WIDEN ir<%add>.1 = 
add nsw ir<%l>.1, ir<10> +; CHECK-NEXT: vp<[[VPTR3:%.+]]> = vector-pointer vp<[[PADD1]]> +; CHECK-NEXT: vp<[[VPTR4:%.+]]> = vector-pointer vp<[[PADD1]]>, ir<1> +; CHECK-NEXT: WIDEN store vp<[[VPTR3]]>, ir<%add> +; CHECK-NEXT: WIDEN store vp<[[VPTR4]]>, ir<%add>.1 ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV:%.+]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-cond ir ; CHECK-NEXT: No successors