From d3614bcbf77ac92b1f1dd37711392f828bd695f3 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 25 Sep 2024 12:20:07 +0100 Subject: [PATCH 01/25] [VPlan] Use ResumePhi to create reduction resume phis. Use VPInstruction::ResumePhi to create phi nodes for reduction resume values. This allows simplifying createAndCollectMergePhiForReduction to only collect reduction resume phis when vectorizing epilogue loops and adding extra incoming edges from the main vector loop. --- .../Transforms/Vectorize/LoopVectorize.cpp | 82 +++++++++---------- .../RISCV/vplan-vp-intrinsics-reduction.ll | 9 ++ ...-order-recurrence-sink-replicate-region.ll | 2 + .../LoopVectorize/vplan-printing.ll | 9 ++ 4 files changed, 61 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index db650b23e271e..74104304301a8 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7467,23 +7467,31 @@ static void addRuntimeUnrollDisableMetaData(Loop *L) { } // Check if \p RedResult is a ComputeReductionResult instruction, and if it is -// create a merge phi node for it. -static void createAndCollectMergePhiForReduction( - VPInstruction *RedResult, - VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock, - bool VectorizingEpilogue) { +// create a merge phi node for it and add incoming values from the main vector +// loop. 
+static void updateAndCollectMergePhiForReductionForEpilogueVectorization( + VPInstruction *RedResult, VPTransformState &State, Loop *OrigLoop, + BasicBlock *LoopMiddleBlock, bool VectorizingEpilogue) { if (!RedResult || RedResult->getOpcode() != VPInstruction::ComputeReductionResult) return; + using namespace VPlanPatternMatch; + VPValue *ResumePhiVPV = + cast(*find_if(RedResult->users(), [](VPUser *U) { + return match(U, m_VPInstruction(m_VPValue(), + m_VPValue())); + })); + auto *BCBlockPhi = cast(State.get(ResumePhiVPV, true)); auto *PhiR = cast(RedResult->getOperand(0)); const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); + if (!VectorizingEpilogue) + return; - Value *FinalValue = State.get(RedResult, VPLane(VPLane::getFirstLane())); auto *ResumePhi = dyn_cast(PhiR->getStartValue()->getUnderlyingValue()); - if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind( - RdxDesc.getRecurrenceKind())) { + if (RecurrenceDescriptor::isAnyOfRecurrenceKind( + RdxDesc.getRecurrenceKind())) { auto *Cmp = cast(PhiR->getStartValue()->getUnderlyingValue()); assert(Cmp->getPredicate() == CmpInst::ICMP_NE); assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue()); @@ -7493,40 +7501,15 @@ static void createAndCollectMergePhiForReduction( "when vectorizing the epilogue loop, we need a resume phi from main " "vector loop"); - // TODO: bc.merge.rdx should not be created here, instead it should be - // modeled in VPlan. BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader(); - // Create a phi node that merges control-flow from the backedge-taken check - // block and the middle block. - auto *BCBlockPhi = - PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx", - LoopScalarPreHeader->getTerminator()->getIterator()); - // If we are fixing reductions in the epilogue loop then we should already // have created a bc.merge.rdx Phi after the main vector body. Ensure that // we carry over the incoming values correctly. 
for (auto *Incoming : predecessors(LoopScalarPreHeader)) { - if (Incoming == LoopMiddleBlock) - BCBlockPhi->addIncoming(FinalValue, Incoming); - else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming)) - BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), - Incoming); - else - BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming); + if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming)) + BCBlockPhi->setIncomingValueForBlock( + Incoming, ResumePhi->getIncomingValueForBlock(Incoming)); } - - auto *OrigPhi = cast(PhiR->getUnderlyingValue()); - // TODO: This fixup should instead be modeled in VPlan. - // Fix the scalar loop reduction variable with the incoming reduction sum - // from the vector body and from the backedge value. - int IncomingEdgeBlockIdx = - OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); - assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); - // Pick the other block. - int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); - OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); - Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); - OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); } DenseMap LoopVectorizationPlanner::executePlan( @@ -7617,11 +7600,12 @@ DenseMap LoopVectorizationPlanner::executePlan( // 2.5 Collect reduction resume values. auto *ExitVPBB = cast(BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); - for (VPRecipeBase &R : *ExitVPBB) { - createAndCollectMergePhiForReduction( - dyn_cast(&R), State, OrigLoop, - State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs); - } + if (IsEpilogueVectorization) + for (VPRecipeBase &R : *ExitVPBB) { + updateAndCollectMergePhiForReductionForEpilogueVectorization( + dyn_cast(&R), State, OrigLoop, + State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs); + } // 2.6. 
Maintain Loop Hints // Keep all loop hints from the original loop on the vector loop (we'll @@ -9411,6 +9395,22 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( }); FinalReductionResult->insertBefore(*MiddleVPBB, IP); + VPBasicBlock *ScalarPHVPBB = nullptr; + if (MiddleVPBB->getNumSuccessors() == 2) { + // Order is strict: first is the exit block, second is the scalar + // preheader. + ScalarPHVPBB = cast(MiddleVPBB->getSuccessors()[1]); + } else { + ScalarPHVPBB = cast(MiddleVPBB->getSingleSuccessor()); + } + + VPBuilder ScalarPHBuilder(ScalarPHVPBB); + auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp( + VPInstruction::ResumePhi, {FinalReductionResult, PhiR->getStartValue()}, + {}, "bc.merge.rdx"); + auto *RedPhi = cast(PhiR->getUnderlyingInstr()); + Plan->addLiveOut(RedPhi, ResumePhiRecipe); + // Adjust AnyOf reductions; replace the reduction phi for the selected value // with a boolean reduction phi node to check if the condition is true in // any iteration. The final value is selected by the final diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index 90c209cf3f518..6a435709aeb2b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -65,7 +65,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: No successors ; IF-EVL-INLOOP-EMPTY: ; IF-EVL-INLOOP-NEXT: scalar.ph: +; IF-EVL-INLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start> ; IF-EVL-INLOOP-NEXT: No successors +; IF-EVL-INLOOP-EMPTY: +; IF-EVL-INLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]> ; IF-EVL-INLOOP-NEXT: } ; @@ -104,7 +107,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-OUTLOOP-NEXT: No successors ; NO-VP-OUTLOOP-EMPTY: ; NO-VP-OUTLOOP-NEXT: scalar.ph: +; NO-VP-OUTLOOP-NEXT: EMIT 
vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start> ; NO-VP-OUTLOOP-NEXT: No successors +; NO-VP-OUTLOOP-EMPTY: +; NO-VP-OUTLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]> ; NO-VP-OUTLOOP-NEXT: } ; @@ -143,7 +149,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-INLOOP-NEXT: No successors ; NO-VP-INLOOP-EMPTY: ; NO-VP-INLOOP-NEXT: scalar.ph: +; NO-VP-INLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start> ; NO-VP-INLOOP-NEXT: No successors +; NO-VP-INLOOP-EMPTY: +; NO-VP-INLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]> ; NO-VP-INLOOP-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll index 8e56614a2e3d5..b05980bef1b38 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -232,9 +232,11 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0> +; CHECK-NEXT: EMIT vp<[[RESUME_RED:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<1234> ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]> +; CHECK-NEXT: Live-out i32 %and.red = vp<[[RESUME_RED]]> ; CHECK-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 0dde507d08be7..2247295295663 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -165,7 +165,10 @@ define float @print_reduction(i64 %n, ptr noalias %y) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph +; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, 
ir<0.000000e+00> ; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: Live-out float %red = vp<[[RED_RESUME]]> ; CHECK-NEXT: } ; entry: @@ -221,7 +224,10 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph +; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00> ; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: Live-out float %red = vp<[[RED_RESUME]]> ; CHECK-NEXT: } ; entry: @@ -447,7 +453,10 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph +; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00> ; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: Live-out float %sum.07 = vp<[[RED_RESUME]]> ; CHECK-NEXT:} entry: From 5f8fabe01a7c6ee8e9ca37e9566e52467983bff6 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 25 Sep 2024 12:18:39 +0100 Subject: [PATCH 02/25] [VPlan] Remove loop region in optimizeForVFAndUF. 
--- .../Transforms/Vectorize/LoopVectorize.cpp | 75 ++++--- llvm/lib/Transforms/Vectorize/VPlan.cpp | 143 +++++++------ llvm/lib/Transforms/Vectorize/VPlan.h | 5 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 9 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 44 +++- .../LoopVectorize/AArch64/call-costs.ll | 17 +- .../LoopVectorize/RISCV/low-trip-count.ll | 192 ++++++++---------- .../LoopVectorize/RISCV/short-trip-count.ll | 44 ++-- .../truncate-to-minimal-bitwidth-cost.ll | 27 +-- .../LoopVectorize/SystemZ/pr47665.ll | 77 +++---- ...demanding-all-lanes-and-first-lane-only.ll | 56 +++-- .../LoopVectorize/X86/constant-fold.ll | 18 +- .../Transforms/LoopVectorize/X86/pr34438.ll | 26 +-- .../LoopVectorize/first-order-recurrence.ll | 33 +-- .../vector-loop-backedge-elimination.ll | 4 +- .../version-stride-with-integer-casts.ll | 15 +- 16 files changed, 359 insertions(+), 426 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 74104304301a8..a573147c41241 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2927,6 +2927,9 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, for (const auto &KV : Plan.getLiveOuts()) KV.second->fixPhi(Plan, State); + if (!isa(State.Plan->getEntry()->getSingleSuccessor())) + return; + for (Instruction *PI : PredicatedInstructions) sinkScalarOperands(&*PI); @@ -7537,7 +7540,8 @@ DenseMap LoopVectorizationPlanner::executePlan( LLVM_DEBUG(BestVPlan.dump()); // Perform the actual loop transformation. - VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan); + VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan, + Legal->getWidestInductionType()); // 0. Generate SCEV-dependent code into the preheader, including TripCount, // before making any changes to the CFG. 
@@ -7598,14 +7602,15 @@ DenseMap LoopVectorizationPlanner::executePlan( BestVPlan.execute(&State); // 2.5 Collect reduction resume values. - auto *ExitVPBB = - cast(BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); - if (IsEpilogueVectorization) + if (IsEpilogueVectorization) { + auto *ExitVPBB = cast( + BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); for (VPRecipeBase &R : *ExitVPBB) { updateAndCollectMergePhiForReductionForEpilogueVectorization( dyn_cast(&R), State, OrigLoop, State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs); } + } // 2.6. Maintain Loop Hints // Keep all loop hints from the original loop on the vector loop (we'll @@ -7616,24 +7621,26 @@ DenseMap LoopVectorizationPlanner::executePlan( makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupVectorized}); - VPBasicBlock *HeaderVPBB = - BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); - Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); - if (VectorizedLoopID) - L->setLoopID(*VectorizedLoopID); - else { - // Keep all loop hints from the original loop on the vector loop (we'll - // replace the vectorizer-specific hints below). - if (MDNode *LID = OrigLoop->getLoopID()) - L->setLoopID(LID); - - LoopVectorizeHints Hints(L, true, *ORE); - Hints.setAlreadyVectorized(); + if (auto *R = + dyn_cast(BestVPlan.getEntry()->getSingleSuccessor())) { + VPBasicBlock *HeaderVPBB = R->getEntryBasicBlock(); + Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); + if (VectorizedLoopID) + L->setLoopID(*VectorizedLoopID); + else { + // Keep all loop hints from the original loop on the vector loop (we'll + // replace the vectorizer-specific hints below). 
+ if (MDNode *LID = OrigLoop->getLoopID()) + L->setLoopID(LID); + + LoopVectorizeHints Hints(L, true, *ORE); + Hints.setAlreadyVectorized(); + } + TargetTransformInfo::UnrollingPreferences UP; + TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); + if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue) + addRuntimeUnrollDisableMetaData(L); } - TargetTransformInfo::UnrollingPreferences UP; - TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); - if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue) - addRuntimeUnrollDisableMetaData(L); // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. @@ -7642,15 +7649,20 @@ DenseMap LoopVectorizationPlanner::executePlan( ILV.printDebugTracesAtEnd(); // 4. Adjust branch weight of the branch in the middle block. - auto *MiddleTerm = - cast(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator()); - if (MiddleTerm->isConditional() && - hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { - // Assume that `Count % VectorTripCount` is equally distributed. - unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); - assert(TripCount > 0 && "trip count should not be zero"); - const uint32_t Weights[] = {1, TripCount - 1}; - setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); + if (auto *R = + dyn_cast(BestVPlan.getEntry()->getSingleSuccessor())) { + auto *ExitVPBB = cast(R->getSingleSuccessor()); + + auto *MiddleTerm = + cast(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator()); + if (MiddleTerm->isConditional() && + hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { + // Assume that `Count % VectorTripCount` is equally distributed. 
+ unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); + assert(TripCount > 0 && "trip count should not be zero"); + const uint32_t Weights[] = {1, TripCount - 1}; + setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); + } } return State.ExpandedSCEVs; @@ -9464,7 +9476,8 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) { State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step, Kind, cast_if_present(FPBinOp)); DerivedIV->setName("offset.idx"); - assert(DerivedIV != CanonicalIV && "IV didn't need transforming?"); + assert((isa(CanonicalIV) || DerivedIV != CanonicalIV) && + "IV didn't need transforming?"); State.set(this, DerivedIV, VPLane(0)); } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 5e3a638809494..27f74e87bdf42 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -224,9 +224,10 @@ VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() { VPTransformState::VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder, - InnerLoopVectorizer *ILV, VPlan *Plan) + InnerLoopVectorizer *ILV, VPlan *Plan, + Type *CanonicalIVTy) : VF(VF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan), - LVer(nullptr), TypeAnalysis(Plan->getCanonicalIV()->getScalarType()) {} + LVer(nullptr), TypeAnalysis(CanonicalIVTy) {} Value *VPTransformState::get(VPValue *Def, const VPLane &Lane) { if (Def->isLiveIn()) @@ -275,8 +276,8 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) { // Place the code for broadcasting invariant variables in the new preheader. 
IRBuilder<>::InsertPointGuard Guard(Builder); if (SafeToHoist) { - BasicBlock *LoopVectorPreHeader = CFG.VPBB2IRBB[cast( - Plan->getVectorLoopRegion()->getSinglePredecessor())]; + BasicBlock *LoopVectorPreHeader = + CFG.VPBB2IRBB[cast(Plan->getEntry())]; if (LoopVectorPreHeader) Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); } @@ -417,6 +418,12 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { PrevBB->getParent(), CFG.ExitBB); LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n'); + connectToPredecessors(NewBB, CFG); + return NewBB; +} + +void VPBasicBlock::connectToPredecessors(BasicBlock *NewBB, + VPTransformState::CFGState &CFG) { // Hook up the new basic block to its predecessors. for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) { VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock(); @@ -447,38 +454,14 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { } CFG.DTU.applyUpdates({{DominatorTree::Insert, PredBB, NewBB}}); } - return NewBB; } - void VPIRBasicBlock::execute(VPTransformState *State) { assert(getHierarchicalSuccessors().size() <= 2 && "VPIRBasicBlock can have at most two successors at the moment!"); State->Builder.SetInsertPoint(getIRBasicBlock()->getTerminator()); executeRecipes(State, getIRBasicBlock()); - if (getSingleSuccessor()) { - assert(isa(getIRBasicBlock()->getTerminator())); - auto *Br = State->Builder.CreateBr(getIRBasicBlock()); - Br->setOperand(0, nullptr); - getIRBasicBlock()->getTerminator()->eraseFromParent(); - } - - for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) { - VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock(); - BasicBlock *PredBB = State->CFG.VPBB2IRBB[PredVPBB]; - assert(PredBB && "Predecessor basic-block not found building successor."); - LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n'); - auto *PredBBTerminator = PredBB->getTerminator(); - auto *TermBr = cast(PredBBTerminator); - // 
Set each forward successor here when it is created, excluding - // backedges. A backward successor is set when the branch is created. - const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors(); - unsigned idx = PredVPSuccessors.front() == this ? 0 : 1; - assert(!TermBr->getSuccessor(idx) && - "Trying to reset an existing successor block."); - TermBr->setSuccessor(idx, IRBB); - State->CFG.DTU.applyUpdates({{DominatorTree::Insert, PredBB, IRBB}}); - } + connectToPredecessors(getIRBasicBlock(), State->CFG); } void VPBasicBlock::execute(VPTransformState *State) { @@ -962,7 +945,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); // FIXME: Model VF * UF computation completely in VPlan. - assert(VFxUF.getNumUsers() && "VFxUF expected to always have users"); unsigned UF = getUF(); if (VF.getNumUsers()) { Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF); @@ -1034,8 +1016,13 @@ void VPlan::execute(VPTransformState *State) { // skeleton creation, so we can only create the VPIRBasicBlocks now during // VPlan execution rather than earlier during VPlan construction. BasicBlock *MiddleBB = State->CFG.ExitBB; - VPBasicBlock *MiddleVPBB = - cast(getVectorLoopRegion()->getSingleSuccessor()); + VPBlockBase *Leaf = nullptr; + for (VPBlockBase *VPB : vp_depth_first_shallow(getEntry())) + if (VPB->getNumSuccessors() == 0) { + Leaf = VPB; + break; + } + VPBasicBlock *MiddleVPBB = cast(Leaf->getSinglePredecessor()); // Find the VPBB for the scalar preheader, relying on the current structure // when creating the middle block and its successrs: if there's a single // predecessor, it must be the scalar preheader. 
Otherwise, the second @@ -1063,53 +1050,59 @@ void VPlan::execute(VPTransformState *State) { for (VPBlockBase *Block : vp_depth_first_shallow(Entry)) Block->execute(State); - VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock(); - BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB]; - - // Fix the latch value of canonical, reduction and first-order recurrences - // phis in the vector loop. - VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); - for (VPRecipeBase &R : Header->phis()) { - // Skip phi-like recipes that generate their backedege values themselves. - if (isa(&R)) - continue; - - if (isa(&R) || - isa(&R)) { - PHINode *Phi = nullptr; - if (isa(&R)) { - Phi = cast(State->get(R.getVPSingleValue())); - } else { - auto *WidenPhi = cast(&R); - assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) && - "recipe generating only scalars should have been replaced"); - auto *GEP = cast(State->get(WidenPhi)); - Phi = cast(GEP->getPointerOperand()); - } - - Phi->setIncomingBlock(1, VectorLatchBB); + if (auto *LoopRegion = + dyn_cast(getEntry()->getSingleSuccessor())) { + VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock(); + BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB]; + + // Fix the latch value of canonical, reduction and first-order recurrences + // phis in the vector loop. + VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); + for (VPRecipeBase &R : Header->phis()) { + // Skip phi-like recipes that generate their backedege values themselves. + if (isa(&R)) + continue; - // Move the last step to the end of the latch block. This ensures - // consistent placement of all induction updates. 
- Instruction *Inc = cast(Phi->getIncomingValue(1)); - Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode()); + if (isa(&R) || + isa(&R)) { + PHINode *Phi = nullptr; + if (isa(&R)) { + Phi = cast(State->get(R.getVPSingleValue())); + } else { + auto *WidenPhi = cast(&R); + assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) && + "recipe generating only scalars should have been replaced"); + auto *GEP = cast(State->get(WidenPhi)); + Phi = cast(GEP->getPointerOperand()); + } + + Phi->setIncomingBlock(1, VectorLatchBB); + + // Move the last step to the end of the latch block. This ensures + // consistent placement of all induction updates. + Instruction *Inc = cast(Phi->getIncomingValue(1)); + Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode()); + + // Use the steps for the last part as backedge value for the induction. + if (auto *IV = dyn_cast(&R)) + Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand())); + continue; + } - // Use the steps for the last part as backedge value for the induction. - if (auto *IV = dyn_cast(&R)) - Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand())); - continue; + // For canonical IV, first-order recurrences and in-order reduction phis, + // only a single part is generated, which provides the last part from the + // previous iteration. For non-ordered reductions all UF parts are + // generated. 
+ auto *PhiR = cast(&R); + bool NeedsScalar = + isa(PhiR) || + (isa(PhiR) && + cast(PhiR)->isInLoop()); + Value *Phi = State->get(PhiR, NeedsScalar); + Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar); + cast(Phi)->addIncoming(Val, VectorLatchBB); } - - auto *PhiR = cast(&R); - bool NeedsScalar = - isa(PhiR) || - (isa(PhiR) && - cast(PhiR)->isInLoop()); - Value *Phi = State->get(PhiR, NeedsScalar); - Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar); - cast(Phi)->addIncoming(Val, VectorLatchBB); } - State->CFG.DTU.flush(); assert(State->CFG.DTU.getDomTree().verify( DominatorTree::VerificationLevel::Fast) && diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 68a62638b9d58..986092ca9b11f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -236,7 +236,7 @@ class VPLane { struct VPTransformState { VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder, - InnerLoopVectorizer *ILV, VPlan *Plan); + InnerLoopVectorizer *ILV, VPlan *Plan, Type *CanonicalIVTy); /// The chosen Vectorization Factor of the loop being vectorized. ElementCount VF; @@ -3378,6 +3378,8 @@ class VPBasicBlock : public VPBlockBase { protected: /// Execute the recipes in the IR basic block \p BB. void executeRecipes(VPTransformState *State, BasicBlock *BB); + void connectToPredecessors(BasicBlock *NewBB, + VPTransformState::CFGState &CFG); private: /// Create an IR BasicBlock to hold the output instructions generated by this @@ -3499,6 +3501,7 @@ class VPRegionBlock : public VPBlockBase { assert(!isReplicator() && "should only get pre-header of loop regions"); return getSinglePredecessor()->getExitingBasicBlock(); } + void clearEntry() { Entry = nullptr; } /// An indicator whether this region is to generate multiple replicated /// instances of output IR corresponding to its VPBlockBases. 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index ba94cd2958766..efc924e5b5802 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -210,8 +210,9 @@ bool VPRecipeBase::mayHaveSideEffects() const { void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { VPValue *ExitValue = getOperand(0); - VPBasicBlock *MiddleVPBB = - cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); + auto *Region = dyn_cast(Plan.getEntry()->getSingleSuccessor()); + VPBasicBlock *MiddleVPBB = dyn_cast_or_null( + Region ? Region->getSingleSuccessor() : nullptr); VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe(); auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr; // Values leaving the vector loop reach live out phi's in the exiting block @@ -2208,7 +2209,9 @@ void VPBranchOnMaskRecipe::execute(VPTransformState &State) { // Replace the temporary unreachable terminator with a new conditional branch, // whose two destinations will be set later when they are created. 
auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); - assert(isa(CurrentTerminator) && + assert((isa(CurrentTerminator) || + (isa(CurrentTerminator) && + !CurrentTerminator->getOperand(0))) && "Expected to replace unreachable terminator with conditional branch."); auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); CondBr->setSuccessor(0, nullptr); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 379bfc0a4394b..f2db068e046ca 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -696,16 +696,46 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C)) return; - LLVMContext &Ctx = SE.getContext(); - auto *BOC = - new VPInstruction(VPInstruction::BranchOnCond, - {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}); - SmallVector PossiblyDead(Term->operands()); Term->eraseFromParent(); + VPBasicBlock *Header = + cast(Plan.getVectorLoopRegion()->getEntry()); + if (all_of(Header->phis(), [](VPRecipeBase &R) { + return !isa(&R); + })) { + for (VPRecipeBase &R : make_early_inc_range(Header->phis())) { + auto *P = cast(&R); + P->replaceAllUsesWith(P->getStartValue()); + P->eraseFromParent(); + } + + VPBlockBase *Preheader = Plan.getVectorLoopRegion()->getSinglePredecessor(); + auto HeaderSuccs = to_vector(Header->getSuccessors()); + VPBasicBlock *Exiting = + cast(Plan.getVectorLoopRegion()->getExiting()); + + auto *LoopRegion = Plan.getVectorLoopRegion(); + VPBlockBase *Middle = LoopRegion->getSingleSuccessor(); + VPBlockUtils::disconnectBlocks(Preheader, LoopRegion); + VPBlockUtils::disconnectBlocks(LoopRegion, Middle); + + Header->setParent(nullptr); + Exiting->setParent(nullptr); + VPBlockUtils::connectBlocks(Preheader, Header); + + VPBlockUtils::connectBlocks(Exiting, Middle); + LoopRegion->clearEntry(); + delete LoopRegion; + } 
else { + LLVMContext &Ctx = SE.getContext(); + auto *BOC = + new VPInstruction(VPInstruction::BranchOnCond, + {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}); + + ExitingVPBB->appendRecipe(BOC); + } for (VPValue *Op : PossiblyDead) recursivelyDeleteDeadRecipes(Op); - ExitingVPBB->appendRecipe(BOC); Plan.setVF(BestVF); Plan.setUF(BestUF); // TODO: Further simplifications are possible @@ -714,7 +744,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, } /// Sink users of \p FOR after the recipe defining the previous value \p -/// Previous of the recurrence. \returns true if all users of \p FOR could be +// Previous of the recurrence. \returns true if all users of \p FOR could be /// re-arranged as needed or false if it is not possible. static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll index dcb8ba7366166..47886b28439dd 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll @@ -78,17 +78,13 @@ define void @powi_call(ptr %P) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[WIDE_LOAD]], i32 3) -; CHECK-NEXT: 
store <2 x double> [[TMP3]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0 +; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TMP4]], align 8 +; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -102,7 +98,7 @@ define void @powi_call(ptr %P) { ; CHECK-NEXT: store double [[POWI]], ptr [[GEP]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -233,6 +229,5 @@ declare i64 @llvm.fshl.i64(i64, i64, i64) ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll index ec50b0cac0382..516c5d0f08274 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll @@ -49,28 +49,24 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 3, [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 3, [[TMP2]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP7]], i64 3) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 3) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i8.p0(ptr [[TMP6]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP7:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, 
zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP10:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv2i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv2i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP10:%.*]] = add [[TMP7]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv2i8.p0( [[TMP10]], ptr [[TMP11]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -79,15 +75,15 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; 
CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP14]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP12]], 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]] +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP13]] ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 3 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -118,28 +114,24 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 5, [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 5, [[TMP2]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 5) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 
[[TMP7]] +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 5) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP6]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP7:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP10:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP10:%.*]] = add [[TMP7]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-NEXT: call void 
@llvm.masked.store.nxv4i8.p0( [[TMP10]], ptr [[TMP11]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -148,15 +140,15 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP14]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP12]], 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]] +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP13]] ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -187,28 +179,24 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 8, [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 8, [[TMP2]] ; CHECK-NEXT: 
[[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 8) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 8) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP6]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP7:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP10:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: 
[[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP10:%.*]] = add [[TMP7]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP10]], ptr [[TMP11]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -217,15 +205,15 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP14]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP12]], 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]] +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP13]] ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 8 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], 
!llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -254,21 +242,17 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = shl <16 x i8> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <16 x i8> [[TMP3]], [[WIDE_LOAD1]] -; CHECK-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP5]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = shl <16 x i8> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = 
add <16 x i8> [[TMP2]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: store <16 x i8> [[TMP5]], ptr [[TMP6]], align 1 +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -285,7 +269,7 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -315,21 +299,17 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = shl <32 x i8> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP5]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <32 x i8> [[TMP3]], [[WIDE_LOAD1]] -; CHECK-NEXT: store <32 x i8> [[TMP6]], ptr [[TMP5]], align 1 -; CHECK-NEXT: 
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = shl <32 x i8> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = add <32 x i8> [[TMP2]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: store <32 x i8> [[TMP5]], ptr [[TMP6]], align 1 +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -346,7 +326,7 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 32 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -390,7 +370,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -407,7 +387,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 24 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll index bb716d78ca411..71244694ae299 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll @@ -12,18 +12,14 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP0]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i32(i32 [[TMP3]], i32 4) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv1i32.p0(ptr [[TMP5]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP6:%.*]] = add nsw [[WIDE_MASKED_LOAD]], 
shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.nxv1i32.p0( [[TMP6]], ptr [[TMP5]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP2]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 4) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv1i32.p0(ptr [[TMP4]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP5:%.*]] = add nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv1i32.p0( [[TMP5]], ptr [[TMP6]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -37,7 +33,7 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran ; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV]], 3 -; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -71,18 +67,14 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 4 -; CHECK-NEXT: br label 
[[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP5]], i32 4) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP8:%.*]] = add nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP8]], ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP4]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP6]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP7:%.*]] = add nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP7]], ptr [[TMP8]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -96,7 +88,7 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang ; CHECK-NEXT: store i32 
[[ADD]], ptr [[GEP]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV]], 3 -; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll index fa13cd8f19ae5..6eface402f6df 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll @@ -163,20 +163,12 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i8> [[BROADCAST_SPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = trunc <2 x i8> [[BROADCAST_SPLAT]] to <2 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i1> , [[TMP0]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 0, i32 2) +; CHECK-NEXT: [[TMP3:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP2]], <2 x i1> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT3]], <2 x ptr> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[VEC_IV:%.*]] = add <2 x i32> 
[[BROADCAST_SPLAT2]], -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[VEC_IV]], i32 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 [[TMP1]], i32 2) -; CHECK-NEXT: [[TMP3:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP2]], <2 x i1> zeroinitializer ; CHECK-NEXT: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> zeroinitializer, <2 x ptr> [[BROADCAST_SPLAT4]], i32 1, <2 x i1> [[TMP3]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 -; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -194,7 +186,7 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 { ; CHECK-NEXT: [[ADD]] = add i8 [[F_039]], 1 ; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[F_039]] to i32 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[CONV]], 1 -; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -293,7 +285,7 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0( [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT6]], i32 8, [[TMP8]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label 
%[[SCALAR_PH]] @@ -314,7 +306,7 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64 ; CHECK: [[LOOP_LATCH]]: ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[V]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -354,8 +346,7 @@ attributes #1 = { "target-features"="+64bit,+v" } ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll index 5e8f287217478..11344c7ff00bb 100644 --- a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll +++ b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll @@ -17,127 +17,105 @@ define void @test(ptr %p, i40 %a) { ; CHECK-NEXT: [[TMP6:%.*]] = icmp ult <16 x i1> zeroinitializer, [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i1> [[TMP6]], ; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <16 x i1> [[TMP7]], zeroinitializer -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE32:%.*]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i32> poison, i32 [[INDEX]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT2]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[VEC_IV:%.*]] = add <16 x i32> [[BROADCAST_SPLAT3]], -; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <16 x i32> [[VEC_IV]], -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[TMP0]], i32 0 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 ; CHECK-NEXT: store i1 [[TMP10]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x i1> [[TMP0]], i32 1 -; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] +; CHECK: pred.store.if1: +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: store i1 [[TMP9]], ptr [[P]], align 1 +; 
CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] +; CHECK: pred.store.continue2: +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] ; CHECK: pred.store.if3: ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 ; CHECK-NEXT: store i1 [[TMP12]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] ; CHECK: pred.store.continue4: -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i1> [[TMP0]], i32 2 -; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] ; CHECK: pred.store.if5: ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 ; CHECK-NEXT: store i1 [[TMP14]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] ; CHECK: pred.store.continue6: -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i1> [[TMP0]], i32 3 -; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] ; CHECK: pred.store.if7: ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 ; CHECK-NEXT: store i1 [[TMP16]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] ; CHECK: pred.store.continue8: -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP0]], i32 4 -; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] ; CHECK: pred.store.if9: ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 ; CHECK-NEXT: store i1 [[TMP18]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]] ; CHECK: pred.store.continue10: -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i1> [[TMP0]], i32 5 -; CHECK-NEXT: br i1 [[TMP19]], label 
[[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] ; CHECK: pred.store.if11: ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 ; CHECK-NEXT: store i1 [[TMP20]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]] ; CHECK: pred.store.continue12: -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x i1> [[TMP0]], i32 6 -; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]] ; CHECK: pred.store.if13: ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 ; CHECK-NEXT: store i1 [[TMP22]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]] ; CHECK: pred.store.continue14: -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x i1> [[TMP0]], i32 7 -; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]] ; CHECK: pred.store.if15: ; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 ; CHECK-NEXT: store i1 [[TMP24]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE16]] ; CHECK: pred.store.continue16: -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x i1> [[TMP0]], i32 8 -; CHECK-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] ; CHECK: pred.store.if17: ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 ; CHECK-NEXT: store i1 [[TMP26]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE18]] ; CHECK: pred.store.continue18: -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP0]], i32 9 -; CHECK-NEXT: br i1 [[TMP27]], label 
[[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] +; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] ; CHECK: pred.store.if19: ; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 ; CHECK-NEXT: store i1 [[TMP28]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE20]] ; CHECK: pred.store.continue20: -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i1> [[TMP0]], i32 10 -; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] +; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] ; CHECK: pred.store.if21: ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 ; CHECK-NEXT: store i1 [[TMP30]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE22]] ; CHECK: pred.store.continue22: -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i1> [[TMP0]], i32 11 -; CHECK-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] +; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] ; CHECK: pred.store.if23: ; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 ; CHECK-NEXT: store i1 [[TMP32]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE24]] ; CHECK: pred.store.continue24: -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x i1> [[TMP0]], i32 12 -; CHECK-NEXT: br i1 [[TMP33]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] +; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] ; CHECK: pred.store.if25: ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 ; CHECK-NEXT: store i1 [[TMP34]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE26]] ; CHECK: pred.store.continue26: -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i1> [[TMP0]], i32 13 -; CHECK-NEXT: br i1 [[TMP35]], label 
[[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] +; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] ; CHECK: pred.store.if27: ; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 ; CHECK-NEXT: store i1 [[TMP36]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE28]] ; CHECK: pred.store.continue28: -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP0]], i32 14 -; CHECK-NEXT: br i1 [[TMP37]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]] +; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]] ; CHECK: pred.store.if29: ; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 ; CHECK-NEXT: store i1 [[TMP38]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE30]] ; CHECK: pred.store.continue30: -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x i1> [[TMP0]], i32 15 -; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32]] -; CHECK: pred.store.if31: -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 -; CHECK-NEXT: store i1 [[TMP40]], ptr [[P]], align 1 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE32]] -; CHECK: pred.store.continue32: -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE32:%.*]] +; CHECK: for.body.0: +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -156,7 +134,7 @@ define void @test(ptr %p, i40 %a) { ; CHECK-NEXT: store i1 [[ICMP_SGT]], ptr [[P]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[IV_NEXT]], 10 -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; 
CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -183,7 +161,6 @@ exit: ; preds = %for.body } ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} ;. diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll index fcf1ba072a62c..fa39fc262ef5a 100644 --- a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll +++ b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll @@ -14,32 +14,22 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP4:%.*]] = mul nsw i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = mul nsw i64 [[TMP1]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = mul nsw i64 [[TMP2]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = mul nsw i64 [[TMP3]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = mul nsw i64 0, 4 +; CHECK-NEXT: [[TMP1:%.*]] = mul 
nsw i64 0, 4 +; CHECK-NEXT: [[TMP2:%.*]] = mul nsw i64 0, 4 +; CHECK-NEXT: [[TMP3:%.*]] = mul nsw i64 0, 4 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[TMP8]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[TMP9]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[TMP10]], align 1 -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[TMP11]], align 1 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP13]], i32 1 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i8> [[TMP17]], i8 [[TMP14]], i32 2 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i8> [[TMP18]], i8 [[TMP15]], i32 3 +; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 [[TMP21]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 4 ; 
CHECK-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP23]], align 4 ; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0 @@ -52,26 +42,27 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP20]], i32 1 ; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] ; CHECK: [[PRED_STORE_IF1]]: -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0 ; CHECK-NEXT: store i32 [[TMP27]], ptr [[DST]], align 4 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]] ; CHECK: [[PRED_STORE_CONTINUE2]]: ; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP20]], i32 2 ; CHECK-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] ; CHECK: [[PRED_STORE_IF3]]: -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0 ; CHECK-NEXT: store i32 [[TMP29]], ptr [[DST]], align 4 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]] ; CHECK: [[PRED_STORE_CONTINUE4]]: ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3 -; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]] +; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] ; CHECK: [[PRED_STORE_IF5]]: -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0 ; CHECK-NEXT: store i32 [[TMP31]], ptr [[DST]], align 4 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]] ; CHECK: [[PRED_STORE_CONTINUE6]]: -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br label %[[THEN_0:.*]] +; CHECK: [[THEN_0]]: +; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -93,7 +84,7 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p ; CHECK: [[LOOP_LATCH]]: ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 4 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -125,7 +116,6 @@ exit: } ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll index 61cae9c1b3f5d..1e6e38632701f 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll @@ -15,17 +15,11 @@ define void @f1() { ; CHECK-NEXT: bb1: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16 -; CHECK-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[TMP0]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr ptr, ptr [[TMP2]], i32 0 -; CHECK-NEXT: store <2 x ptr> , ptr [[TMP3]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = sext i16 0 to i64 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr ptr, ptr [[TMP1]], i32 0 +; CHECK-NEXT: store <2 x ptr> , ptr [[TMP2]], align 8 +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[BB3:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -40,7 +34,7 @@ define void @f1() { ; CHECK-NEXT: store ptr [[_TMP2]], ptr [[_TMP7]], align 8 ; CHECK-NEXT: [[_TMP9]] = add nsw i16 [[C_1_0]], 1 ; CHECK-NEXT: [[_TMP11:%.*]] = icmp slt i16 [[_TMP9]], 2 -; CHECK-NEXT: br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: bb3: ; CHECK-NEXT: ret void ; 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll index cc60359af2f8c..47662d155b545 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll @@ -14,20 +14,16 @@ define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly % ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = 
getelementptr inbounds float, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 +; CHECK-NEXT: store <8 x float> [[TMP4]], ptr [[TMP5]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -43,7 +39,7 @@ define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly % ; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll index c4e3e0b8c5a36..cd91eb9757c0a 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -3705,15 +3705,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) { ; UNROLL-NO-IC-NEXT: entry: ; UNROLL-NO-IC-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-IC: vector.ph: -; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] -; UNROLL-NO-IC: vector.body: -; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], 
[[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4 -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 -; UNROLL-NO-IC-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; UNROLL-NO-IC-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: @@ -3726,7 +3719,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) { ; UNROLL-NO-IC-NEXT: [[ADD]] = add i64 [[PHI]], 1 ; UNROLL-NO-IC-NEXT: [[LOAD]] = load i32, ptr [[SRC]], align 4 ; UNROLL-NO-IC-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1 -; UNROLL-NO-IC-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]] ; UNROLL-NO-IC: exit: ; UNROLL-NO-IC-NEXT: ret i32 0 ; @@ -3734,13 +3727,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) { ; UNROLL-NO-VF-NEXT: entry: ; UNROLL-NO-VF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-VF: vector.ph: -; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] -; UNROLL-NO-VF: vector.body: -; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[TMP0]] = load i32, ptr [[SRC:%.*]], align 4 -; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; UNROLL-NO-VF-NEXT: 
[[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4 +; UNROLL-NO-VF-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: @@ -3753,7 +3741,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) { ; UNROLL-NO-VF-NEXT: [[ADD]] = add i64 [[PHI]], 1 ; UNROLL-NO-VF-NEXT: [[LOAD]] = load i32, ptr [[SRC]], align 4 ; UNROLL-NO-VF-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1 -; UNROLL-NO-VF-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]] ; UNROLL-NO-VF: exit: ; UNROLL-NO-VF-NEXT: ret i32 0 ; @@ -3761,15 +3749,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) { ; SINK-AFTER-NEXT: entry: ; SINK-AFTER-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SINK-AFTER: vector.ph: -; SINK-AFTER-NEXT: br label [[VECTOR_BODY:%.*]] -; SINK-AFTER: vector.body: -; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ] ; SINK-AFTER-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4 -; SINK-AFTER-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 -; SINK-AFTER-NEXT: [[BROADCAST_SPLAT]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; SINK-AFTER-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; SINK-AFTER-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; SINK-AFTER: middle.block: ; SINK-AFTER-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; SINK-AFTER: scalar.ph: @@ -3782,7 +3763,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr 
noalias %dst) { ; SINK-AFTER-NEXT: [[ADD]] = add i64 [[PHI]], 1 ; SINK-AFTER-NEXT: [[LOAD]] = load i32, ptr [[SRC]], align 4 ; SINK-AFTER-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1 -; SINK-AFTER-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]] ; SINK-AFTER: exit: ; SINK-AFTER-NEXT: ret i32 0 ; diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll index fd75177c0d106..c144cf0ca5c52 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll @@ -11,9 +11,9 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; VF8UF1: [[CMP:%.+]] = icmp eq i64 %index.next, %n.vec ; VF8UF1-NEXT: br i1 [[CMP]], label %middle.block, label %vector.body ; -; VF8UF2: br i1 true, label %middle.block, label %vector.body +; VF8UF2: br label %middle.block ; -; VF16UF1: br i1 true, label %middle.block, label %vector.body +; VF16UF1: br label %middle.block ; entry: %and = and i64 %N, 15 diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll index b3ec3e8f0f3c6..010a7f3b73a21 100644 --- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll +++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll @@ -498,18 +498,14 @@ define void @sext_of_i1_stride(i1 %g, ptr %dst) mustprogress { ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], [[G_64]] -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; 
CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], [[G_64]] ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 0, [[G_64]] -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 0, [[G_64]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], [[TMP8]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 -3 ; CHECK-NEXT: store <4 x i16> , ptr [[TMP7]], align 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -522,7 +518,7 @@ define void @sext_of_i1_stride(i1 %g, ptr %dst) mustprogress { ; CHECK-NEXT: store i16 [[G_16]], ptr [[GEP]], align 2 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], [[G_64]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 16 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -559,6 +555,5 @@ exit: ; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]} ; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} ; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]} -; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} -; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]]} ;. 
From f0421c685f34b6b11234320a3eb41b7e8c618214 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 4 Nov 2024 13:19:14 +0000 Subject: [PATCH 03/25] !fixup after merge --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 30 ++++------------ llvm/lib/Transforms/Vectorize/VPlan.h | 4 +-- .../RISCV/vplan-vp-intrinsics-reduction.ll | 1 + .../vplan-printing-before-execute.ll | 35 ++++++++----------- 4 files changed, 23 insertions(+), 47 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index a1ba3952620ca..8ef5d0253218a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -448,8 +448,10 @@ void VPBasicBlock::connectToPredecessors(BasicBlock *NewBB, // Set each forward successor here when it is created, excluding // backedges. A backward successor is set when the branch is created. unsigned idx = PredVPSuccessors.front() == this ? 0 : 1; - assert(!TermBr->getSuccessor(idx) && - "Trying to reset an existing successor block."); + assert( + (!TermBr->getSuccessor(idx) || + (isa(this) && TermBr->getSuccessor(idx) == NewBB)) && + "Trying to reset an existing successor block."); TermBr->setSuccessor(idx, NewBB); } CFG.DTU.applyUpdates({{DominatorTree::Insert, PredBB, NewBB}}); @@ -472,23 +474,7 @@ void VPIRBasicBlock::execute(VPTransformState *State) { "other blocks must be terminated by a branch"); } - for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) { - VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock(); - BasicBlock *PredBB = State->CFG.VPBB2IRBB[PredVPBB]; - assert(PredBB && "Predecessor basic-block not found building successor."); - LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n'); - - auto *PredBBTerminator = PredBB->getTerminator(); - auto *TermBr = cast(PredBBTerminator); - // Set each forward successor here when it is created, excluding - // backedges. A backward successor is set when the branch is created. 
- const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors(); - unsigned idx = PredVPSuccessors.front() == this ? 0 : 1; - assert((!TermBr->getSuccessor(idx) || TermBr->getSuccessor(idx) == IRBB) && - "Trying to reset an existing successor block."); - TermBr->setSuccessor(idx, IRBB); - State->CFG.DTU.applyUpdates({{DominatorTree::Insert, PredBB, IRBB}}); - } + connectToPredecessors(IRBB, State->CFG); } void VPBasicBlock::execute(VPTransformState *State) { @@ -1039,11 +1025,9 @@ void VPlan::execute(VPTransformState *State) { // VPlan execution rather than earlier during VPlan construction. BasicBlock *MiddleBB = State->CFG.ExitBB; BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor(); + VPBasicBlock *MiddleVPBB = getMiddleBlock(); replaceVPBBWithIRVPBB(getScalarPreheader(), ScalarPh); - if (getVectorLoopRegion()) { - VPBasicBlock *MiddleVPBB = getMiddleBlock(); - replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB); - } + replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB); // Disconnect the middle block from its single successor (the scalar loop // header) in both the CFG and DT. The branch will be recreated during VPlan diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index bfd6b4f375946..6f7f73a920a4b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3874,9 +3874,7 @@ class VPlan { } /// Returns the preheader of the vector loop region. - VPBasicBlock *getVectorPreheader() { - return cast(getVectorLoopRegion()->getSinglePredecessor()); - } + VPBasicBlock *getVectorPreheader() { return cast(getEntry()); } /// Returns the canonical induction recipe of the vector loop. 
VPCanonicalIVPHIRecipe *getCanonicalIV() { diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index ca745d8b98cee..77a9d105c85f3 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -72,6 +72,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ; IF-EVL-INLOOP-NEXT: IR %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ] ; IF-EVL-INLOOP: IR %exitcond.not = icmp eq i64 %iv.next, %n +; IF-EVL-INLOOP-NEXT: No successors ; IF-EVL-INLOOP-NEXT: } ; diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll index c9612ced3eee0..3d1947a38acc7 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll @@ -57,7 +57,6 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; ; CHECK: Executing best plan with VF=8, UF=2 ; CHECK-NEXT: VPlan 'Final VPlan for VF={8},UF={2}' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count ; CHECK-EMPTY: @@ -67,27 +66,21 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: -; CHECK-NEXT: Successor(s): vector loop +; CHECK-NEXT: Successor(s): vector.body ; CHECK-EMPTY: -; CHECK-NEXT: vector loop: { -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> -; CHECK-NEXT: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> -; CHECK-NEXT: EMIT vp<[[PADD1:%.+]]> = ptradd ir<%A>, vp<[[STEPS1]]> -; CHECK-NEXT: 
vp<[[VPTR1:%.]]> = vector-pointer vp<[[PADD1]]> -; CHECK-NEXT: vp<[[VPTR2:%.]]> = vector-pointer vp<[[PADD1]]>, ir<1> -; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VPTR1]]> -; CHECK-NEXT: WIDEN ir<%l>.1 = load vp<[[VPTR2]]> -; CHECK-NEXT: WIDEN ir<%add> = add nsw ir<%l>, ir<10> -; CHECK-NEXT: WIDEN ir<%add>.1 = add nsw ir<%l>.1, ir<10> -; CHECK-NEXT: vp<[[VPTR3:%.+]]> = vector-pointer vp<[[PADD1]]> -; CHECK-NEXT: vp<[[VPTR4:%.+]]> = vector-pointer vp<[[PADD1]]>, ir<1> -; CHECK-NEXT: WIDEN store vp<[[VPTR3]]>, ir<%add> -; CHECK-NEXT: WIDEN store vp<[[VPTR4]]>, ir<%add>.1 -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV:%.+]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-cond ir -; CHECK-NEXT: No successors -; CHECK-NEXT: } +; CHECK-NEXT: vector.body: +; CHECK-NEXT: vp<[[STEPS1:%.+]]> = SCALAR-STEPS ir<0>, ir<1> +; CHECK-NEXT: EMIT vp<[[PADD1:%.+]]> = ptradd ir<%A>, vp<[[STEPS1]]> +; CHECK-NEXT: vp<[[VPTR1:%.]]> = vector-pointer vp<[[PADD1]]> +; CHECK-NEXT: vp<[[VPTR2:%.]]> = vector-pointer vp<[[PADD1]]>, ir<1> +; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VPTR1]]> +; CHECK-NEXT: WIDEN ir<%l>.1 = load vp<[[VPTR2]]> +; CHECK-NEXT: WIDEN ir<%add> = add nsw ir<%l>, ir<10> +; CHECK-NEXT: WIDEN ir<%add>.1 = add nsw ir<%l>.1, ir<10> +; CHECK-NEXT: vp<[[VPTR3:%.+]]> = vector-pointer vp<[[PADD1]]> +; CHECK-NEXT: vp<[[VPTR4:%.+]]> = vector-pointer vp<[[PADD1]]>, ir<1> +; CHECK-NEXT: WIDEN store vp<[[VPTR3]]>, ir<%add> +; CHECK-NEXT: WIDEN store vp<[[VPTR4]]>, ir<%add>.1 ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: From a4843b5e91d6da59a9e942db320a2f92ebe99438 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 4 Nov 2024 19:17:13 +0000 Subject: [PATCH 04/25] !fixup address latest comments, thanks! 
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++-- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 934564fb75b04..554fd57a719d8 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7729,9 +7729,9 @@ DenseMap LoopVectorizationPlanner::executePlan( dyn_cast(BestVPlan.getEntry()->getSingleSuccessor())) { VPBasicBlock *HeaderVPBB = R->getEntryBasicBlock(); Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); - if (VectorizedLoopID) + if (VectorizedLoopID) { L->setLoopID(*VectorizedLoopID); - else { + } else { // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). if (MDNode *LID = OrigLoop->getLoopID()) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 2869330078ae4..53ec6a4829e17 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -694,8 +694,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, SmallVector PossiblyDead(Term->operands()); Term->eraseFromParent(); - VPBasicBlock *Header = - cast(Plan.getVectorLoopRegion()->getEntry()); + auto *Header = cast(Plan.getVectorLoopRegion()->getEntry()); if (all_of(Header->phis(), [](VPRecipeBase &R) { return !isa(&R); })) { @@ -706,7 +705,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, } VPBlockBase *Preheader = Plan.getVectorLoopRegion()->getSinglePredecessor(); - VPBasicBlock *Exiting = + auto *Exiting = cast(Plan.getVectorLoopRegion()->getExiting()); auto *LoopRegion = Plan.getVectorLoopRegion(); @@ -741,7 +740,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, } /// Sink 
users of \p FOR after the recipe defining the previous value \p -// Previous of the recurrence. \returns true if all users of \p FOR could be +/// Previous of the recurrence. \returns true if all users of \p FOR could be /// re-arranged as needed or false if it is not possible. static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, From 706b68134f86fe8381d8244544dedefbecaea65a Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 24 Nov 2024 08:05:03 +0000 Subject: [PATCH 05/25] !fixup address latest comments, thanks! --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 8 +-- .../Transforms/Vectorize/VPlanTransforms.cpp | 18 +++-- .../LoopVectorize/RISCV/low-trip-count.ll | 72 ++++++++----------- .../LoopVectorize/RISCV/short-trip-count.ll | 40 +++++------ .../debugloc-optimize-vfuf-term.ll | 43 +++++------ 5 files changed, 77 insertions(+), 104 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 8b228520cea51..70c2f5f320af9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1054,8 +1054,8 @@ void VPlan::execute(VPTransformState *State) { if (isa(&R)) continue; - if (isa(&R) || - isa(&R)) { + if (isa( + &R)) { PHINode *Phi = nullptr; if (isa(&R)) { Phi = cast(State->get(R.getVPSingleValue())); @@ -1085,10 +1085,10 @@ void VPlan::execute(VPTransformState *State) { // previous iteration. For non-ordered reductions all UF parts are // generated. 
auto *PhiR = cast(&R); + auto *RedPhiR = dyn_cast(PhiR); bool NeedsScalar = isa(PhiR) || - (isa(PhiR) && - cast(PhiR)->isInLoop()); + (RedPhiR && RedPhiR->isInLoop()); Value *Phi = State->get(PhiR, NeedsScalar); Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar); cast(Phi)->addIncoming(Val, VectorLatchBB); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 02f6fc8e907c0..5a64f6318a654 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -693,9 +693,14 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, SmallVector PossiblyDead(Term->operands()); Term->eraseFromParent(); auto *Header = cast(Plan.getVectorLoopRegion()->getEntry()); - if (all_of(Header->phis(), [](VPRecipeBase &R) { - return !isa(&R); - })) { + if (any_of(Header->phis(), + IsaPred)) { + LLVMContext &Ctx = SE.getContext(); + auto *BOC = new VPInstruction( + VPInstruction::BranchOnCond, + {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc()); + ExitingVPBB->appendRecipe(BOC); + } else { for (VPRecipeBase &R : make_early_inc_range(Header->phis())) { auto *P = cast(&R); P->replaceAllUsesWith(P->getStartValue()); @@ -720,13 +725,6 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, // be deleted when the region is deleted. 
LoopRegion->clearEntry(); delete LoopRegion; - } else { - LLVMContext &Ctx = SE.getContext(); - auto *BOC = - new VPInstruction(VPInstruction::BranchOnCond, - {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc()); - - ExitingVPBB->appendRecipe(BOC); } for (VPValue *Op : PossiblyDead) recursivelyDeleteDeadRecipes(Op); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll index f8937e47cd0df..a6da9ec9d0452 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll @@ -56,21 +56,17 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 3) -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i8.p0(ptr [[TMP6]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP7:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP10:%.*]] = shl [[WIDE_MASKED_LOAD]], splat (i8 1) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] 
= call @llvm.masked.load.nxv2i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv2i8.p0( [[TMP13]], ptr [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -78,12 +74,12 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] -; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP12]], 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] -; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP13]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP15]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]] +; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]] ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: 
[[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 3 @@ -125,21 +121,17 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 5) -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP6]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP7:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP10:%.*]] = shl [[WIDE_MASKED_LOAD]], splat (i8 1) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP13]], ptr [[TMP14]], 
i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -147,12 +139,12 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] -; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP12]], 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] -; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP13]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP15]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]] +; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]] ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5 @@ -194,21 +186,17 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 8) -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP6]], 
i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP7:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP10:%.*]] = shl [[WIDE_MASKED_LOAD]], splat (i8 1) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP13]], ptr [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -216,12 +204,12 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] -; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], 
align 1 -; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP12]], 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] -; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP13]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP15]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]] +; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]] ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 8 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll index 692569bb2c860..8212d28f8aedd 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll @@ -12,18 +12,14 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP0]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i32(i32 [[TMP3]], i32 4) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call 
@llvm.masked.load.nxv1i32.p0(ptr [[TMP5]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 4) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4:%.*]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv1i32.p0(ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP6:%.*]] = add nsw [[WIDE_MASKED_LOAD]], splat (i32 1) -; CHECK-NEXT: call void @llvm.masked.store.nxv1i32.p0( [[TMP6]], ptr [[TMP5]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP2]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv1i32.p0( [[TMP6]], ptr [[TMP8]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -31,7 +27,7 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 [[IV]] ; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[GEP]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[V]], 1 ; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP]], align 4 @@ -71,18 +67,14 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 
4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP5]], i32 4) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6:%.*]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP9]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP8:%.*]] = add nsw [[WIDE_MASKED_LOAD]], splat (i32 1) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP8]], ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP4]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP8]], ptr [[TMP10]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -90,7 +82,7 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr 
inbounds i32, ptr [[A]], i32 [[IV]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 [[IV]] ; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[GEP]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[V]], 1 ; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll index 04ce9562c04b5..22d1c61992b3f 100644 --- a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll +++ b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll @@ -9,24 +9,20 @@ define i32 @foo(ptr %p) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ], !dbg [[DBG3:![0-9]+]] -; CHECK-NEXT: store i8 0, ptr [[P]], align 1, !dbg [[DBG7:![0-9]+]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2, !dbg [[DBG3]] -; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !dbg [[DBG3]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: store i8 0, ptr [[P]], align 1, !dbg [[DBG3:![0-9]+]] +; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG11:![0-9]+]] +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG7:![0-9]+]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], !dbg [[DBG3]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], !dbg [[DBG8:![0-9]+]] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], !dbg [[DBG3]] -; 
CHECK-NEXT: [[CONV:%.*]] = trunc i64 0 to i8, !dbg [[DBG12:![0-9]+]] -; CHECK-NEXT: store i8 [[CONV]], ptr [[P]], align 1, !dbg [[DBG7]] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1, !dbg [[DBG13:![0-9]+]] -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1, !dbg [[DBG14:![0-9]+]] -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG11]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], !dbg [[DBG8]] +; CHECK-NEXT: [[CONV:%.*]] = trunc i64 0 to i8, !dbg [[DBG9:![0-9]+]] +; CHECK-NEXT: store i8 [[CONV]], ptr [[P]], align 1, !dbg [[DBG3]] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1, !dbg [[DBG10:![0-9]+]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1, !dbg [[DBG11:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG7]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret i32 0 ; @@ -63,17 +59,16 @@ exit: ; preds = %loop ;. 
; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) ; CHECK: [[META1]] = !DIFile(filename: "test.cpp", directory: {{.*}}) -; CHECK: [[DBG3]] = !DILocation(line: 4, scope: [[META4:![0-9]+]]) +; CHECK: [[DBG3]] = !DILocation(line: 6, scope: [[META4:![0-9]+]]) ; CHECK: [[META4]] = distinct !DISubprogram(name: "foo", scope: [[META1]], file: [[META1]], line: 11, type: [[META5:![0-9]+]], spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META6:![0-9]+]]) ; CHECK: [[META5]] = distinct !DISubroutineType(types: [[META6]]) ; CHECK: [[META6]] = !{} -; CHECK: [[DBG7]] = !DILocation(line: 6, scope: [[META4]]) -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META9:![0-9]+]], [[META10:![0-9]+]]} -; CHECK: [[META9]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META10]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[DBG11]] = !DILocation(line: 9, scope: [[META4]]) -; CHECK: [[DBG12]] = !DILocation(line: 5, scope: [[META4]]) -; CHECK: [[DBG13]] = !DILocation(line: 7, scope: [[META4]]) -; CHECK: [[DBG14]] = !DILocation(line: 8, scope: [[META4]]) -; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META10]], [[META9]]} +; CHECK: [[DBG7]] = !DILocation(line: 9, scope: [[META4]]) +; CHECK: [[DBG8]] = !DILocation(line: 4, scope: [[META4]]) +; CHECK: [[DBG9]] = !DILocation(line: 5, scope: [[META4]]) +; CHECK: [[DBG10]] = !DILocation(line: 7, scope: [[META4]]) +; CHECK: [[DBG11]] = !DILocation(line: 8, scope: [[META4]]) +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META13:![0-9]+]], [[META14:![0-9]+]]} +; CHECK: [[META13]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[META14]] = !{!"llvm.loop.isvectorized", i32 1} ;. 
From 71436fc905be3131651bca1a069370261a396c72 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 24 Nov 2024 11:09:08 +0000 Subject: [PATCH 06/25] !fixup fix formatting --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 70c2f5f320af9..e3e1cc85636e0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -222,7 +222,8 @@ VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() { VPTransformState::VPTransformState(const TargetTransformInfo *TTI, ElementCount VF, unsigned UF, LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder, - InnerLoopVectorizer *ILV, VPlan *Plan, Type *CanonicalIVTy) + InnerLoopVectorizer *ILV, VPlan *Plan, + Type *CanonicalIVTy) : TTI(TTI), VF(VF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan), LVer(nullptr), TypeAnalysis(CanonicalIVTy) {} From cc43362acee766a668c86c60142641e753522c70 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 7 Dec 2024 15:33:15 +0000 Subject: [PATCH 07/25] !fixup update after merge. 
--- llvm/lib/Transforms/Vectorize/VPlan.cpp | 24 ++++--------------- llvm/lib/Transforms/Vectorize/VPlan.h | 7 ++++-- .../Transforms/Vectorize/VPlanTransforms.cpp | 2 -- .../LoopVectorize/AArch64/call-costs.ll | 2 ++ .../LoopVectorize/RISCV/low-trip-count.ll | 23 ++++++++++++------ .../LoopVectorize/RISCV/short-trip-count.ll | 12 ++++++---- .../truncate-to-minimal-bitwidth-cost.ll | 2 ++ .../LoopVectorize/SystemZ/pr47665.ll | 4 ++-- ...demanding-all-lanes-and-first-lane-only.ll | 8 +++---- .../LoopVectorize/X86/constant-fold.ll | 2 ++ .../Transforms/LoopVectorize/X86/pr34438.ll | 2 ++ .../debugloc-optimize-vfuf-term.ll | 20 +++++++++------- .../LoopVectorize/first-order-recurrence.ll | 6 +++++ .../version-stride-with-integer-casts.ll | 2 ++ 14 files changed, 66 insertions(+), 50 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 48733ff9124f0..2d5c05309fb24 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -486,7 +486,7 @@ void VPBasicBlock::execute(VPTransformState *State) { }; // 1. Create an IR basic block. - if (this == getPlan()->getVectorPreheader() || + if (this == getPlan()->getEntry() || (Replica && this == getParent()->getEntry()) || IsReplicateRegion(getSingleHierarchicalPredecessor())) { // Reuse the previous basic block if the current VPBB is either @@ -1078,30 +1078,14 @@ void VPlan::execute(VPTransformState *State) { continue; } - // For canonical IV, first-order recurrences and in-order reduction phis, - // only a single part is generated, which provides the last part from the - // previous iteration. For non-ordered reductions all UF parts are - // generated. 
auto *PhiR = cast(&R); - auto *RedPhiR = dyn_cast(PhiR); - bool NeedsScalar = - isa(PhiR) || - (RedPhiR && RedPhiR->isInLoop()); + bool NeedsScalar = isa(PhiR) || + (isa(PhiR) && + cast(PhiR)->isInLoop()); Value *Phi = State->get(PhiR, NeedsScalar); Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar); cast(Phi)->addIncoming(Val, VectorLatchBB); } -/*<<<<<<< HEAD*/ -/*=======*/ - - /*auto *PhiR = cast(&R);*/ - /*bool NeedsScalar = isa(PhiR) ||*/ - /*(isa(PhiR) &&*/ - /*cast(PhiR)->isInLoop());*/ - /*Value *Phi = State->get(PhiR, NeedsScalar);*/ - /*Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar);*/ - /*cast(Phi)->addIncoming(Val, VectorLatchBB);*/ -/*>>>>>>> origin/main*/ } State->CFG.DTU.flush(); assert(State->CFG.DTU.getDomTree().verify( diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 1ffd198a129ef..b57054d8227a8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3872,7 +3872,10 @@ class VPlan { /// Returns the preheader of the vector loop region. VPBasicBlock *getVectorPreheader() { - return dyn_cast(getVectorLoopRegion()->getSinglePredecessor()); + auto *LoopRegion = getVectorLoopRegion(); + if (!LoopRegion) + return nullptr; + return dyn_cast(LoopRegion->getSinglePredecessor()); } /// Returns the VPRegionBlock of the vector loop. 
@@ -3880,7 +3883,7 @@ class VPlan { return dyn_cast(getEntry()->getSingleSuccessor()); } const VPRegionBlock *getVectorLoopRegion() const { - return cast(getEntry()->getSingleSuccessor()); + return dyn_cast(getEntry()->getSingleSuccessor()); } /// Returns the 'middle' block of the plan, that is the block that selects diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index fa3b9605de266..b1e2f26d1db65 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1849,8 +1849,6 @@ void VPlanTransforms::createInterleaveGroups( } void VPlanTransforms::prepareToExecute(VPlan &Plan) { - ReversePostOrderTraversal> RPOT( - Plan.getVectorLoopRegion()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( vp_depth_first_deep(Plan.getEntry()))) { for (VPRecipeBase &R : make_early_inc_range(VPBB->phis())) { diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll index a01f179281223..e63155b024c43 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll @@ -78,6 +78,8 @@ define void @powi_call(ptr %P) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll index 334aec6217681..10ac870c112ae 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll +++ 
b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll @@ -55,6 +55,8 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 3) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 @@ -64,9 +66,9 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv2i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.nxv2i8.p0( [[TMP13]], ptr [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -120,6 +122,8 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 5) ; CHECK-NEXT: [[TMP8:%.*]] 
= getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 @@ -129,9 +133,8 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP13]], ptr [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -186,6 +189,8 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 8) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 @@ -195,8 +200,8 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], 
poison) ; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP13]], ptr [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -243,6 +248,8 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 @@ -300,6 +307,8 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP1]], align 1 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll index 616b7c9af5b57..3386a7d3972aa 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll @@ -12,13 +12,15 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) 
nounwind vscale_ran ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP0]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 4) ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4:%.*]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv1i32.p0(ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP6:%.*]] = add nsw [[WIDE_MASKED_LOAD]], splat (i32 1) -; CHECK-NEXT: call void @llvm.masked.store.nxv1i32.p0( [[TMP6]], ptr [[TMP5]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP2]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv1i32.p0( [[TMP6]], ptr [[TMP8]], i32 4, [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -67,13 +69,15 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6:%.*]], i32 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP9]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP8:%.*]] = add nsw 
[[WIDE_MASKED_LOAD]], splat (i32 1) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP8]], ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP8]], ptr [[TMP10]], i32 4, [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll index 9556c3653f37d..3dd5e70e8d5b1 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll @@ -163,6 +163,8 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i8> [[BROADCAST_SPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = trunc <2 x i8> [[BROADCAST_SPLAT]] to <2 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i1> splat (i1 true), [[TMP0]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 0, i32 2) ; CHECK-NEXT: [[TMP3:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP2]], <2 x i1> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0 diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll index f7de8a6fd2be5..21063e32e81b6 100644 --- a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll +++ b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll @@ -17,6 +17,8 @@ define void @test(ptr %p, i40 %a) { ; 
CHECK-NEXT: [[TMP6:%.*]] = icmp ult <16 x i1> zeroinitializer, [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i1> [[TMP6]], splat (i1 true) ; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <16 x i1> [[TMP7]], zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: ; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 @@ -113,8 +115,6 @@ define void @test(ptr %p, i40 %a) { ; CHECK-NEXT: store i1 [[TMP38]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE30]] ; CHECK: pred.store.continue30: -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE32:%.*]] -; CHECK: for.body.0: ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll index fa39fc262ef5a..c912e18e74ecf 100644 --- a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll +++ b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll @@ -14,6 +14,8 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw i64 0, 4 ; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i64 0, 4 ; CHECK-NEXT: [[TMP2:%.*]] = mul nsw i64 0, 4 @@ -23,11 +25,11 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]] ; 
CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1 ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[TMP8]], align 1 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 4 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0 @@ -60,8 +62,6 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p ; CHECK-NEXT: store i32 [[TMP31]], ptr [[DST]], align 4 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]] ; CHECK: [[PRED_STORE_CONTINUE6]]: -; CHECK-NEXT: br label %[[THEN_0:.*]] -; CHECK: [[THEN_0]]: ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll index 1e6e38632701f..83e2f84814add 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll @@ -15,6 +15,8 @@ define void @f1() { ; CHECK-NEXT: bb1: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: ; CHECK-NEXT: [[TMP0:%.*]] = sext i16 0 to i64 ; 
CHECK-NEXT: [[TMP1:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr ptr, ptr [[TMP1]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll index 47662d155b545..7816c4918761f 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll @@ -14,6 +14,8 @@ define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly % ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll index 22d1c61992b3f..0f34f6243f155 100644 --- a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll +++ b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll @@ -9,20 +9,22 @@ define i32 @foo(ptr %p) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: store i8 0, ptr [[P]], align 1, !dbg [[DBG3:![0-9]+]] -; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] +; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]], !dbg [[DBG7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG7:![0-9]+]] +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG8:![0-9]+]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], !dbg [[DBG8:![0-9]+]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], !dbg [[DBG9:![0-9]+]] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], !dbg [[DBG8]] -; CHECK-NEXT: [[CONV:%.*]] = trunc i64 0 to i8, !dbg [[DBG9:![0-9]+]] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], !dbg [[DBG9]] +; CHECK-NEXT: [[CONV:%.*]] = trunc i64 0 to i8, !dbg [[DBG7]] ; CHECK-NEXT: store i8 [[CONV]], ptr [[P]], align 1, !dbg [[DBG3]] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1, !dbg [[DBG10:![0-9]+]] ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1, !dbg [[DBG11:![0-9]+]] -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG7]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG8]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret i32 0 ; @@ -63,9 +65,9 @@ exit: ; preds = %loop ; CHECK: [[META4]] = distinct !DISubprogram(name: "foo", scope: [[META1]], file: [[META1]], line: 11, type: [[META5:![0-9]+]], spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META6:![0-9]+]]) ; CHECK: [[META5]] = distinct !DISubroutineType(types: [[META6]]) ; CHECK: [[META6]] = !{} -; CHECK: [[DBG7]] = !DILocation(line: 9, scope: [[META4]]) -; CHECK: [[DBG8]] = !DILocation(line: 4, scope: [[META4]]) -; CHECK: [[DBG9]] = !DILocation(line: 5, scope: [[META4]]) +; CHECK: [[DBG7]] = !DILocation(line: 5, scope: [[META4]]) +; CHECK: [[DBG8]] = !DILocation(line: 9, scope: [[META4]]) +; CHECK: [[DBG9]] = !DILocation(line: 4, scope: [[META4]]) ; CHECK: [[DBG10]] = !DILocation(line: 7, scope: [[META4]]) ; CHECK: [[DBG11]] = !DILocation(line: 8, scope: [[META4]]) ; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], 
[[META13:![0-9]+]], [[META14:![0-9]+]]} diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll index 4d8f2b8e21dd7..2da5daef61562 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -3705,6 +3705,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) { ; UNROLL-NO-IC-NEXT: entry: ; UNROLL-NO-IC-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-IC: vector.ph: +; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] +; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4 ; UNROLL-NO-IC-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; UNROLL-NO-IC: middle.block: @@ -3727,6 +3729,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) { ; UNROLL-NO-VF-NEXT: entry: ; UNROLL-NO-VF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-VF: vector.ph: +; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] +; UNROLL-NO-VF: vector.body: ; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4 ; UNROLL-NO-VF-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; UNROLL-NO-VF: middle.block: @@ -3749,6 +3753,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) { ; SINK-AFTER-NEXT: entry: ; SINK-AFTER-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SINK-AFTER: vector.ph: +; SINK-AFTER-NEXT: br label [[VECTOR_BODY:%.*]] +; SINK-AFTER: vector.body: ; SINK-AFTER-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4 ; SINK-AFTER-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; SINK-AFTER: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll index 69c541ba6bbbb..1137456779eec 100644 --- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll +++ 
b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll @@ -497,6 +497,8 @@ define void @sext_of_i1_stride(i1 %g, ptr %dst) mustprogress { ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], [[G_64]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 0, [[G_64]] ; CHECK-NEXT: [[TMP8:%.*]] = mul i64 0, [[G_64]] ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], [[TMP8]] From e758945a0b4f36c8f5d3e6c2039ed295587f85b4 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 7 Dec 2024 15:39:18 +0000 Subject: [PATCH 08/25] !fixup remove unrelated changes. --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- llvm/lib/Transforms/Vectorize/VPlan.cpp | 2 +- llvm/lib/Transforms/Vectorize/VPlan.h | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d5beb47a29b26..67660313a4576 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2999,7 +2999,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State); } - if (!isa(State.Plan->getEntry()->getSingleSuccessor())) + if (!State.Plan->getVectorLoopRegion()) return; for (Instruction *PI : PredicatedInstructions) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 2d5c05309fb24..479604e616e69 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1009,8 +1009,8 @@ void VPlan::execute(VPTransformState *State) { // skeleton creation, so we can only create the VPIRBasicBlocks now during // VPlan execution rather than earlier during VPlan construction. 
BasicBlock *MiddleBB = State->CFG.ExitBB; - BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor(); VPBasicBlock *MiddleVPBB = getMiddleBlock(); + BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor(); replaceVPBBWithIRVPBB(getScalarPreheader(), ScalarPh); replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index b57054d8227a8..150e398da3aed 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3592,8 +3592,6 @@ class VPBasicBlock : public VPBlockBase { protected: /// Execute the recipes in the IR basic block \p BB. void executeRecipes(VPTransformState *State, BasicBlock *BB); - void connectToPredecessors(BasicBlock *NewBB, - VPTransformState::CFGState &CFG); /// Connect the VPBBs predecessors' in the VPlan CFG to the IR basic block /// generated for this VPBB. From 1d4b2e64f42e572fc73e1d20753ab68b04c58b14 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 17 Dec 2024 21:26:13 +0000 Subject: [PATCH 09/25] !fixup partial fixup after merging main. --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 8 ++++---- llvm/lib/Transforms/Vectorize/VPlan.cpp | 17 +++++++++++++++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index bb0cab961fa0b..53d3d593412a7 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2471,6 +2471,8 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { /// scalar preheader. static void introduceCheckBlockInVPlan(VPlan &Plan, BasicBlock *CheckIRBB) { VPBlockBase *ScalarPH = Plan.getScalarPreheader(); + // FIXME: Cannot get the vector preheader at the moment if the vector loop + // region has been removed. 
VPBlockBase *VectorPH = Plan.getVectorPreheader(); VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor(); if (PreVectorPH->getNumSuccessors() != 1) { @@ -7862,8 +7864,7 @@ DenseMap LoopVectorizationPlanner::executePlan( makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupVectorized}); - if (auto *R = - dyn_cast(BestVPlan.getEntry()->getSingleSuccessor())) { + if (auto *R = BestVPlan.getVectorLoopRegion()) { VPBasicBlock *HeaderVPBB = R->getEntryBasicBlock(); Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); if (VectorizedLoopID) { @@ -7890,8 +7891,7 @@ DenseMap LoopVectorizationPlanner::executePlan( ILV.printDebugTracesAtEnd(); // 4. Adjust branch weight of the branch in the middle block. - if (auto *R = - dyn_cast(BestVPlan.getEntry()->getSingleSuccessor())) { + if (auto *R = BestVPlan.getVectorLoopRegion()) { auto *ExitVPBB = cast(R->getSingleSuccessor()); auto *MiddleTerm = diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 52f77a4598cc5..11a71820c22a2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1022,8 +1022,7 @@ void VPlan::execute(VPTransformState *State) { for (VPBlockBase *Block : RPOT) Block->execute(State); - if (auto *LoopRegion = - dyn_cast(getEntry()->getSingleSuccessor())) { + if (auto *LoopRegion = getVectorLoopRegion()) { VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock(); BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB]; @@ -1047,6 +1046,19 @@ void VPlan::execute(VPTransformState *State) { Phi = cast(GEP->getPointerOperand()); } + Phi->setIncomingBlock(1, VectorLatchBB); + + // Move the last step to the end of the latch block. This ensures + // consistent placement of all induction updates. 
+ Instruction *Inc = cast(Phi->getIncomingValue(1)); + Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode()); + + // Use the steps for the last part as backedge value for the induction. + if (auto *IV = dyn_cast(&R)) + Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand())); + continue; + } + auto *PhiR = cast(&R); bool NeedsScalar = isa(PhiR) || (isa(PhiR) && @@ -1056,6 +1068,7 @@ void VPlan::execute(VPTransformState *State) { cast(Phi)->addIncoming(Val, VectorLatchBB); } } + State->CFG.DTU.flush(); } From 0c76e9d4954ed542279486320330fc51b9253461 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 23 Dec 2024 21:21:03 +0000 Subject: [PATCH 10/25] !fixup update on top of current main, iterate to find vector.ph --- .../Transforms/Vectorize/LoopVectorize.cpp | 12 ++++---- llvm/lib/Transforms/Vectorize/VPlan.cpp | 29 ++++++++++++------ llvm/lib/Transforms/Vectorize/VPlan.h | 8 ++--- .../Transforms/Vectorize/VPlanTransforms.cpp | 5 ++++ .../truncate-to-minimal-bitwidth-cost.ll | 4 +-- .../LoopVectorize/SystemZ/pr47665.ll | 30 +++++++++---------- ...demanding-all-lanes-and-first-lane-only.ll | 30 ++++++++++--------- .../vplan-printing-before-execute.ll | 4 +-- 8 files changed, 68 insertions(+), 54 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 11fbb49d4dd6c..e57eb58eaa040 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2406,12 +2406,12 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, // End if-block. VPRegionBlock *Parent = RepRecipe->getParent()->getParent(); bool IfPredicateInstr = Parent ? 
Parent->isReplicator() : false; - assert((Parent || all_of(RepRecipe->operands(), - [](VPValue *Op) { - return Op->isDefinedOutsideLoopRegions(); - })) && - "Expected a recipe is either within a region or all of its operands " - "are defined outside the vectorized region."); + assert( + (Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || + all_of(RepRecipe->operands(), + [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) && + "Expected a recipe is either within a region or all of its operands " + "are defined outside the vectorized region."); if (IfPredicateInstr) PredicatedInstructions.push_back(Cloned); } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 197d728928838..3664a5aee7b6f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1081,19 +1081,28 @@ InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) { return getVectorLoopRegion()->cost(VF, Ctx); } +VPBasicBlock *VPlan::getVectorPreheader() { + VPBlockBase *Current = getEntry()->getSuccessors().back(); + while (Current->getNumSuccessors() == 2) + Current = Current->getSuccessors().back(); + return cast(Current); +} + +VPBasicBlock *VPlan::getVectorPreheader() const { + VPBlockBase *Current = getEntry()->getSuccessors().back(); + while (Current->getNumSuccessors() == 2) + Current = Current->getSuccessors().back(); + return cast(Current); +} + VPRegionBlock *VPlan::getVectorLoopRegion() { // TODO: Cache if possible. 
- for (VPBlockBase *B : vp_depth_first_shallow(getEntry())) - if (auto *R = dyn_cast(B)) - return R; - return nullptr; + return dyn_cast(getVectorPreheader()->getSingleSuccessor()); + ; } const VPRegionBlock *VPlan::getVectorLoopRegion() const { - for (const VPBlockBase *B : vp_depth_first_shallow(getEntry())) - if (auto *R = dyn_cast(B)) - return R; - return nullptr; + return dyn_cast(getVectorPreheader()->getSingleSuccessor()); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1415,8 +1424,10 @@ void VPlanIngredient::print(raw_ostream &O) const { #endif bool VPValue::isDefinedOutsideLoopRegions() const { + return !hasDefiningRecipe() || - !getDefiningRecipe()->getParent()->getEnclosingLoopRegion(); + (!getDefiningRecipe()->getParent()->getEnclosingLoopRegion() && + getDefiningRecipe()->getParent()->getPlan()->getVectorLoopRegion()); } void VPValue::replaceAllUsesWith(VPValue *New) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index d2297101ab305..87a3884404fed 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3869,12 +3869,8 @@ class VPlan { const VPBasicBlock *getEntry() const { return Entry; } /// Returns the preheader of the vector loop region. - VPBasicBlock *getVectorPreheader() { - auto *LoopRegion = getVectorLoopRegion(); - if (!LoopRegion) - return nullptr; - return dyn_cast(LoopRegion->getSinglePredecessor()); - } + VPBasicBlock *getVectorPreheader(); + VPBasicBlock *getVectorPreheader() const; /// Returns the VPRegionBlock of the vector loop. 
VPRegionBlock *getVectorLoopRegion(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index f227f83fdeb6c..a7097dd193447 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -718,6 +718,11 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, Header->setParent(nullptr); Exiting->setParent(nullptr); + + for (VPBlockBase *B : vp_depth_first_shallow(LoopRegion->getEntry())) { + if (isa(B)) + B->setParent(nullptr); + } VPBlockUtils::connectBlocks(Preheader, Header); VPBlockUtils::connectBlocks(Exiting, Middle); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll index 3dd5e70e8d5b1..7a6676ff6ea1b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll @@ -163,12 +163,12 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i8> [[BROADCAST_SPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = trunc <2 x i8> [[BROADCAST_SPLAT]] to <2 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i1> splat (i1 true), [[TMP0]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT3]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 0, i32 2) ; CHECK-NEXT: [[TMP3:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP2]], <2 x i1> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = 
insertelement <2 x ptr> poison, ptr [[DST]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT3]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> zeroinitializer, <2 x ptr> [[BROADCAST_SPLAT4]], i32 1, <2 x i1> [[TMP3]]) ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll index 21063e32e81b6..2de0f7e4d4016 100644 --- a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll +++ b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll @@ -27,91 +27,91 @@ define void @test(ptr %p, i40 %a) { ; CHECK: pred.store.continue: ; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] ; CHECK: pred.store.if1: -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[TMP8]], i32 1 ; CHECK-NEXT: store i1 [[TMP9]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] ; CHECK: pred.store.continue2: ; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] ; CHECK: pred.store.if3: -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP8]], i32 2 ; CHECK-NEXT: store i1 [[TMP12]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] ; CHECK: pred.store.continue4: ; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] ; CHECK: pred.store.if5: -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i1> [[TMP8]], i32 3 ; CHECK-NEXT: store i1 [[TMP14]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] ; CHECK: pred.store.continue6: ; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF7:%.*]], 
label [[PRED_STORE_CONTINUE8:%.*]] ; CHECK: pred.store.if7: -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP8]], i32 4 ; CHECK-NEXT: store i1 [[TMP16]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] ; CHECK: pred.store.continue8: ; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] ; CHECK: pred.store.if9: -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i1> [[TMP8]], i32 5 ; CHECK-NEXT: store i1 [[TMP18]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]] ; CHECK: pred.store.continue10: ; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] ; CHECK: pred.store.if11: -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP8]], i32 6 ; CHECK-NEXT: store i1 [[TMP20]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]] ; CHECK: pred.store.continue12: ; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]] ; CHECK: pred.store.if13: -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP8]], i32 7 ; CHECK-NEXT: store i1 [[TMP22]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]] ; CHECK: pred.store.continue14: ; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]] ; CHECK: pred.store.if15: -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> [[TMP8]], i32 8 ; CHECK-NEXT: store i1 [[TMP24]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE16]] ; CHECK: pred.store.continue16: ; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF17:%.*]], label 
[[PRED_STORE_CONTINUE18:%.*]] ; CHECK: pred.store.if17: -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i1> [[TMP8]], i32 9 ; CHECK-NEXT: store i1 [[TMP26]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE18]] ; CHECK: pred.store.continue18: ; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] ; CHECK: pred.store.if19: -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP8]], i32 10 ; CHECK-NEXT: store i1 [[TMP28]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE20]] ; CHECK: pred.store.continue20: ; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] ; CHECK: pred.store.if21: -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i1> [[TMP8]], i32 11 ; CHECK-NEXT: store i1 [[TMP30]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE22]] ; CHECK: pred.store.continue22: ; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] ; CHECK: pred.store.if23: -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP8]], i32 12 ; CHECK-NEXT: store i1 [[TMP32]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE24]] ; CHECK: pred.store.continue24: ; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] ; CHECK: pred.store.if25: -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i1> [[TMP8]], i32 13 ; CHECK-NEXT: store i1 [[TMP34]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE26]] ; CHECK: pred.store.continue26: ; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF27:%.*]], label 
[[PRED_STORE_CONTINUE28:%.*]] ; CHECK: pred.store.if27: -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i1> [[TMP8]], i32 14 ; CHECK-NEXT: store i1 [[TMP36]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE28]] ; CHECK: pred.store.continue28: ; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]] ; CHECK: pred.store.if29: -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i1> [[TMP8]], i32 15 ; CHECK-NEXT: store i1 [[TMP38]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE30]] ; CHECK: pred.store.continue30: diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll index c912e18e74ecf..61bcbaa1fe4d2 100644 --- a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll +++ b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll @@ -16,20 +16,22 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[TMP4:%.*]] = mul nsw i64 0, 4 -; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i64 0, 4 -; CHECK-NEXT: [[TMP2:%.*]] = mul nsw i64 0, 4 ; CHECK-NEXT: [[TMP3:%.*]] = mul nsw i64 0, 4 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP4:%.*]] = mul nsw i64 1, 4 +; CHECK-NEXT: [[TMP2:%.*]] = mul nsw i64 2, 4 +; CHECK-NEXT: [[TMP15:%.*]] = mul nsw i64 3, 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr 
[[SRC_1]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP15]] ; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[TMP8]], align 1 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP5]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP6]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i8> [[TMP13]], i8 [[TMP10]], i32 2 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i8> [[TMP14]], i8 [[TMP11]], i32 3 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 4 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0 @@ -44,21 +46,21 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP20]], i32 1 ; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] ; CHECK: [[PRED_STORE_IF1]]: -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 
x i32> [[WIDE_LOAD]], i32 1 ; CHECK-NEXT: store i32 [[TMP27]], ptr [[DST]], align 4 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]] ; CHECK: [[PRED_STORE_CONTINUE2]]: ; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP20]], i32 2 ; CHECK-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] ; CHECK: [[PRED_STORE_IF3]]: -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 2 ; CHECK-NEXT: store i32 [[TMP29]], ptr [[DST]], align 4 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]] ; CHECK: [[PRED_STORE_CONTINUE4]]: ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3 ; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] ; CHECK: [[PRED_STORE_IF5]]: -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: store i32 [[TMP31]], ptr [[DST]], align 4 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]] ; CHECK: [[PRED_STORE_CONTINUE6]]: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll index c1ac6f85b0b64..ba728c6be8ab6 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll @@ -70,7 +70,7 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; CHECK-NEXT: IR %n.vec = sub i64 %and, %n.mod.vf ; CHECK-NEXT: IR %ind.end = sub i64 %and, %n.vec ; CHECK-NEXT: IR %ind.end1 = getelementptr i8, ptr %A, i64 %n.vec -; CHECK-NEXT: Successor(s): vector loop +; CHECK-NEXT: Successor(s): vector.body ; CHECK-EMPTY: ; CHECK-NEXT: vector.body: ; CHECK-NEXT: vp<[[STEPS1:%.+]]> = SCALAR-STEPS ir<0>, ir<1> @@ -85,7 +85,7 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; CHECK-NEXT: 
vp<[[VPTR4:%.+]]> = vector-pointer vp<[[PADD1]]>, ir<1> ; CHECK-NEXT: WIDEN store vp<[[VPTR3]]>, ir<%add> ; CHECK-NEXT: WIDEN store vp<[[VPTR4]]>, ir<%add>.1 -; CHECK-NEXT: Successor(s): middle.block +; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: EMIT vp<[[C:%.+]]> = icmp eq vp<[[TC]]>, ir<[[VTC]]> From dd45cad20284589bbe26db0c64bf8d1ad3210e91 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 18 Dec 2024 14:33:49 +0000 Subject: [PATCH 11/25] [VPlan] Manage created blocks directly in VPlan. (NFC) This patch changes the way blocks are managed by VPlan. Previously all blocks reachable from entry would be cleaned up when a VPlan is destroyed. With this patch, each VPlan keeps track of blocks created for it in a list and this list is then used to delete all blocks in the list when the VPlan is destroyed. To do so, block creation is funneled through helpers in directly in VPlan. The main advantage of doing so is it simplifies CFG transformations, as those do not have to take care of deleting any blocks, just adjusting the CFG. This helps to simplify https://github.com/llvm/llvm-project/pull/108378 and https://github.com/llvm/llvm-project/pull/106748. This also simplifies handling of 'immutable' blocks a VPlan holds references to, which at the moment only include the scalar header block. Note that the original constructors taking VPBlockBase are retained at the moment for unit tests. 
--- .../Transforms/Vectorize/LoopVectorize.cpp | 7 +- llvm/lib/Transforms/Vectorize/VPlan.cpp | 84 ++++++++++++------- llvm/lib/Transforms/Vectorize/VPlan.h | 54 ++++++------ .../Transforms/Vectorize/VPlanHCFGBuilder.cpp | 4 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 19 +++-- 5 files changed, 97 insertions(+), 71 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 355ff40ce770e..4f2bf097d0cde 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2477,7 +2477,7 @@ static void introduceCheckBlockInVPlan(VPlan &Plan, BasicBlock *CheckIRBB) { assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors"); assert(PreVectorPH->getSuccessors()[0] == ScalarPH && "Unexpected successor"); - VPIRBasicBlock *CheckVPIRBB = VPIRBasicBlock::fromBasicBlock(CheckIRBB); + VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB); VPBlockUtils::insertOnEdge(PreVectorPH, VectorPH, CheckVPIRBB); PreVectorPH = CheckVPIRBB; } @@ -8189,11 +8189,10 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( // A new entry block has been created for the epilogue VPlan. Hook it in, as // otherwise we would try to modify the entry to the main vector loop. 
- VPIRBasicBlock *NewEntry = VPIRBasicBlock::fromBasicBlock(Insert); + VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert); VPBasicBlock *OldEntry = Plan.getEntry(); VPBlockUtils::reassociateBlocks(OldEntry, NewEntry); Plan.setEntry(NewEntry); - delete OldEntry; introduceCheckBlockInVPlan(Plan, Insert); return Insert; @@ -9463,7 +9462,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { VPBB->appendRecipe(Recipe); } - VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); + VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB); VPBB = cast(VPBB->getSingleSuccessor()); } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 9a082921d4f7f..e03847cea131f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -205,11 +205,6 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() { return Parent->getEnclosingBlockWithPredecessors(); } -void VPBlockBase::deleteCFG(VPBlockBase *Entry) { - for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Entry))) - delete Block; -} - VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() { iterator It = begin(); while (It != end() && It->isPhi()) @@ -474,6 +469,16 @@ void VPIRBasicBlock::execute(VPTransformState *State) { connectToPredecessors(State->CFG); } +VPIRBasicBlock *VPIRBasicBlock::clone() { + auto *NewBlock = getPlan()->createVPIRBasicBlock(IRBB); + for (VPRecipeBase &R : make_early_inc_range(*NewBlock)) + R.eraseFromParent(); + + for (VPRecipeBase &R : Recipes) + NewBlock->appendRecipe(R.clone()); + return NewBlock; +} + void VPBasicBlock::execute(VPTransformState *State) { bool Replica = bool(State->Lane); BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible. 
@@ -523,6 +528,13 @@ void VPBasicBlock::dropAllReferences(VPValue *NewValue) { } } +VPBasicBlock *VPBasicBlock::clone() { + auto *NewBlock = getPlan()->createVPBasicBlock(getName()); + for (VPRecipeBase &R : *this) + NewBlock->appendRecipe(R.clone()); + return NewBlock; +} + void VPBasicBlock::executeRecipes(VPTransformState *State, BasicBlock *BB) { LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName() << " in BB:" << BB->getName() << '\n'); @@ -541,7 +553,7 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) { SmallVector Succs(successors()); // Create new empty block after the block to split. - auto *SplitBlock = new VPBasicBlock(getName() + ".split"); + auto *SplitBlock = getPlan()->createVPBasicBlock(getName() + ".split"); VPBlockUtils::insertBlockAfter(SplitBlock, this); // Finally, move the recipes starting at SplitAt to new block. @@ -701,8 +713,8 @@ static std::pair cloneFrom(VPBlockBase *Entry) { VPRegionBlock *VPRegionBlock::clone() { const auto &[NewEntry, NewExiting] = cloneFrom(getEntry()); - auto *NewRegion = - new VPRegionBlock(NewEntry, NewExiting, getName(), isReplicator()); + auto *NewRegion = getPlan()->createVPRegionBlock(NewEntry, NewExiting, + getName(), isReplicator()); for (VPBlockBase *Block : vp_depth_first_shallow(NewEntry)) Block->setParent(NewRegion); return NewRegion; @@ -822,17 +834,20 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent, #endif VPlan::VPlan(Loop *L) { - setEntry(VPIRBasicBlock::fromBasicBlock(L->getLoopPreheader())); - ScalarHeader = VPIRBasicBlock::fromBasicBlock(L->getHeader()); + setEntry(createVPIRBasicBlock(L->getLoopPreheader())); + ScalarHeader = createVPIRBasicBlock(L->getHeader()); } VPlan::~VPlan() { if (Entry) { VPValue DummyValue; - for (VPBlockBase *Block : vp_depth_first_shallow(Entry)) - Block->dropAllReferences(&DummyValue); - VPBlockBase::deleteCFG(Entry); + for (auto *VPB : reverse(CreatedBlocks)) + VPB->dropAllReferences(&DummyValue); + + for (auto *VPB : 
reverse(CreatedBlocks)) { + delete VPB; + } } for (VPValue *VPV : VPLiveInsToFree) delete VPV; @@ -840,14 +855,6 @@ VPlan::~VPlan() { delete BackedgeTakenCount; } -VPIRBasicBlock *VPIRBasicBlock::fromBasicBlock(BasicBlock *IRBB) { - auto *VPIRBB = new VPIRBasicBlock(IRBB); - for (Instruction &I : - make_range(IRBB->begin(), IRBB->getTerminator()->getIterator())) - VPIRBB->appendRecipe(new VPIRInstruction(I)); - return VPIRBB; -} - VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, PredicatedScalarEvolution &PSE, bool RequiresScalarEpilogueCheck, @@ -861,7 +868,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, // an epilogue vector loop, the original entry block here will be replaced by // a new VPIRBasicBlock wrapping the entry to the epilogue vector loop after // generating code for the main vector loop. - VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph"); + VPBasicBlock *VecPreheader = Plan->createVPBasicBlock("vector.ph"); VPBlockUtils::connectBlocks(Plan->getEntry(), VecPreheader); // Create SCEV and VPValue for the trip count. @@ -878,17 +885,17 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, // Create VPRegionBlock, with empty header and latch blocks, to be filled // during processing later. 
- VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); - VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); + VPBasicBlock *HeaderVPBB = Plan->createVPBasicBlock("vector.body"); + VPBasicBlock *LatchVPBB = Plan->createVPBasicBlock("vector.latch"); VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); - auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop", - false /*isReplicator*/); + auto *TopRegion = Plan->createVPRegionBlock( + HeaderVPBB, LatchVPBB, "vector loop", false /*isReplicator*/); VPBlockUtils::insertBlockAfter(TopRegion, VecPreheader); - VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); + VPBasicBlock *MiddleVPBB = Plan->createVPBasicBlock("middle.block"); VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); - VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph"); + VPBasicBlock *ScalarPH = Plan->createVPBasicBlock("scalar.ph"); VPBlockUtils::connectBlocks(ScalarPH, ScalarHeader); if (!RequiresScalarEpilogueCheck) { VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); @@ -904,7 +911,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, // we unconditionally branch to the scalar preheader. Do nothing. // 3) Otherwise, construct a runtime check. BasicBlock *IRExitBlock = TheLoop->getUniqueLatchExitBlock(); - auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock); + auto *VPExitBlock = Plan->createVPIRBasicBlock(IRExitBlock); // The connection order corresponds to the operands of the conditional branch. VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB); VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); @@ -960,15 +967,13 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, /// have a single predecessor, which is rewired to the new VPIRBasicBlock. All /// successors of VPBB, if any, are rewired to the new VPIRBasicBlock. 
static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) { - VPIRBasicBlock *IRVPBB = VPIRBasicBlock::fromBasicBlock(IRBB); + VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB); for (auto &R : make_early_inc_range(*VPBB)) { assert(!R.isPhi() && "Tried to move phi recipe to end of block"); R.moveBefore(*IRVPBB, IRVPBB->end()); } VPBlockUtils::reassociateBlocks(VPBB, IRVPBB); - - delete VPBB; } /// Generate the code inside the preheader and body of the vectorized loop. @@ -1217,6 +1222,7 @@ static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry, } VPlan *VPlan::duplicate() { + unsigned CreatedBlockSize = CreatedBlocks.size(); // Clone blocks. const auto &[NewEntry, __] = cloneFrom(Entry); @@ -1257,9 +1263,23 @@ VPlan *VPlan::duplicate() { assert(Old2NewVPValues.contains(TripCount) && "TripCount must have been added to Old2NewVPValues"); NewPlan->TripCount = Old2NewVPValues[TripCount]; + + for (unsigned I = CreatedBlockSize; I != CreatedBlocks.size(); ++I) + NewPlan->CreatedBlocks.push_back(CreatedBlocks[I]); + CreatedBlocks.truncate(CreatedBlockSize); + return NewPlan; } +VPIRBasicBlock *VPlan::createVPIRBasicBlock(BasicBlock *IRBB) { + auto *VPIRBB = new VPIRBasicBlock(IRBB); + for (Instruction &I : + make_range(IRBB->begin(), IRBB->getTerminator()->getIterator())) + VPIRBB->appendRecipe(new VPIRInstruction(I)); + CreatedBlocks.push_back(VPIRBB); + return VPIRBB; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) Twine VPlanPrinter::getUID(const VPBlockBase *Block) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index e2c0ff7954675..eb0e3baa8d4f3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -636,9 +636,6 @@ class VPBlockBase { /// Return the cost of the block. virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0; - /// Delete all blocks reachable from a given VPBlockBase, inclusive. 
- static void deleteCFG(VPBlockBase *Entry); - /// Return true if it is legal to hoist instructions into this block. bool isLegalToHoistInto() { // There are currently no constraints that prevent an instruction to be @@ -3638,12 +3635,7 @@ class VPBasicBlock : public VPBlockBase { /// Clone the current block and it's recipes, without updating the operands of /// the cloned recipes. - VPBasicBlock *clone() override { - auto *NewBlock = new VPBasicBlock(getName()); - for (VPRecipeBase &R : *this) - NewBlock->appendRecipe(R.clone()); - return NewBlock; - } + VPBasicBlock *clone() override; protected: /// Execute the recipes in the IR basic block \p BB. @@ -3679,20 +3671,11 @@ class VPIRBasicBlock : public VPBasicBlock { return V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC; } - /// Create a VPIRBasicBlock from \p IRBB containing VPIRInstructions for all - /// instructions in \p IRBB, except its terminator which is managed in VPlan. - static VPIRBasicBlock *fromBasicBlock(BasicBlock *IRBB); - /// The method which generates the output IR instructions that correspond to /// this VPBasicBlock, thereby "executing" the VPlan. void execute(VPTransformState *State) override; - VPIRBasicBlock *clone() override { - auto *NewBlock = new VPIRBasicBlock(IRBB); - for (VPRecipeBase &R : Recipes) - NewBlock->appendRecipe(R.clone()); - return NewBlock; - } + VPIRBasicBlock *clone() override; BasicBlock *getIRBasicBlock() const { return IRBB; } }; @@ -3732,11 +3715,6 @@ class VPRegionBlock : public VPBlockBase { IsReplicator(IsReplicator) {} ~VPRegionBlock() override { - if (Entry) { - VPValue DummyValue; - Entry->dropAllReferences(&DummyValue); - deleteCFG(Entry); - } } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -3863,6 +3841,8 @@ class VPlan { /// been modeled in VPlan directly. 
DenseMap SCEVToExpansion; + SmallVector CreatedBlocks; + public: /// Construct a VPlan with \p Entry to the plan and with \p ScalarHeader /// wrapping the original header of the scalar loop. @@ -4079,6 +4059,32 @@ class VPlan { /// Clone the current VPlan, update all VPValues of the new VPlan and cloned /// recipes to refer to the clones, and return it. VPlan *duplicate(); + + VPBasicBlock *createVPBasicBlock(const Twine &Name, + VPRecipeBase *Recipe = nullptr) { + auto *VPB = new VPBasicBlock(Name, Recipe); + CreatedBlocks.push_back(VPB); + return VPB; + } + + VPRegionBlock *createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting, + const std::string &Name = "", + bool IsReplicator = false) { + auto *VPB = new VPRegionBlock(Entry, Exiting, Name, IsReplicator); + CreatedBlocks.push_back(VPB); + return VPB; + } + + VPRegionBlock *createVPRegionBlock(const std::string &Name = "", + bool IsReplicator = false) { + auto *VPB = new VPRegionBlock(Name, IsReplicator); + CreatedBlocks.push_back(VPB); + return VPB; + } + + /// Create a VPIRBasicBlock from \p IRBB containing VPIRInstructions for all + /// instructions in \p IRBB, except its terminator which is managed in VPlan. + VPIRBasicBlock *createVPIRBasicBlock(BasicBlock *IRBB); }; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp index 6e633739fcc3d..02f4c8d8872d8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp @@ -182,7 +182,7 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) { // Create new VPBB. StringRef Name = isHeaderBB(BB, TheLoop) ? "vector.body" : BB->getName(); LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << Name << "\n"); - VPBasicBlock *VPBB = new VPBasicBlock(Name); + VPBasicBlock *VPBB = Plan.createVPBasicBlock(Name); BB2VPBB[BB] = VPBB; // Get or create a region for the loop containing BB. 
@@ -204,7 +204,7 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) { if (LoopOfBB == TheLoop) { RegionOfVPBB = Plan.getVectorLoopRegion(); } else { - RegionOfVPBB = new VPRegionBlock(Name.str(), false /*isReplicator*/); + RegionOfVPBB = Plan.createVPRegionBlock(Name.str(), false /*isReplicator*/); RegionOfVPBB->setParent(Loop2Region[LoopOfBB->getParentLoop()]); } RegionOfVPBB->setEntry(VPBB); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 0b809c2b34df9..cd3ea561e6aac 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -297,8 +297,6 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) { DeletedRegions.insert(Region1); } - for (VPRegionBlock *ToDelete : DeletedRegions) - delete ToDelete; return !DeletedRegions.empty(); } @@ -310,7 +308,8 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, assert(Instr->getParent() && "Predicated instruction not in any basic block"); auto *BlockInMask = PredRecipe->getMask(); auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); - auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); + auto *Entry = + Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); // Replace predicated replicate recipe with a replicate recipe without a // mask but in the replicate region. 
@@ -318,7 +317,8 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, PredRecipe->getUnderlyingInstr(), make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())), PredRecipe->isUniform()); - auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask); + auto *Pred = + Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask); VPPredInstPHIRecipe *PHIRecipe = nullptr; if (PredRecipe->getNumUsers() != 0) { @@ -328,8 +328,10 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, PHIRecipe->setOperand(0, RecipeWithoutMask); } PredRecipe->eraseFromParent(); - auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); - VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true); + auto *Exiting = + Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); + VPRegionBlock *Region = + Plan.createVPRegionBlock(Entry, Exiting, RegionName, true); // Note: first set Entry as region entry and then connect successors starting // from it in order, to propagate the "parent" of each VPBasicBlock. @@ -396,7 +398,6 @@ static bool mergeBlocksIntoPredecessors(VPlan &Plan) { VPBlockUtils::disconnectBlocks(VPBB, Succ); VPBlockUtils::connectBlocks(PredVPBB, Succ); } - delete VPBB; } return !WorkList.empty(); } @@ -1898,7 +1899,7 @@ void VPlanTransforms::handleUncountableEarlyExit( if (OrigLoop->getUniqueExitBlock()) { VPEarlyExitBlock = cast(MiddleVPBB->getSuccessors()[0]); } else { - VPEarlyExitBlock = VPIRBasicBlock::fromBasicBlock( + VPEarlyExitBlock = Plan.createVPIRBasicBlock( !OrigLoop->contains(TrueSucc) ? 
TrueSucc : FalseSucc); } @@ -1908,7 +1909,7 @@ void VPlanTransforms::handleUncountableEarlyExit( IsEarlyExitTaken = Builder.createNaryOp(VPInstruction::AnyOf, {EarlyExitTakenCond}); - VPBasicBlock *NewMiddle = new VPBasicBlock("middle.split"); + VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split"); VPBlockUtils::insertOnEdge(LoopRegion, MiddleVPBB, NewMiddle); VPBlockUtils::connectBlocks(NewMiddle, VPEarlyExitBlock); NewMiddle->swapSuccessors(); From e72a71fa3c31872b72a84cd78ac8b9b1aa719883 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 24 Dec 2024 21:51:48 +0000 Subject: [PATCH 12/25] !fixup address comments, add comments --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 3 ++- llvm/lib/Transforms/Vectorize/VPlan.h | 17 +++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index e03847cea131f..204a1e01b9313 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1264,7 +1264,8 @@ VPlan *VPlan::duplicate() { "TripCount must have been added to Old2NewVPValues"); NewPlan->TripCount = Old2NewVPValues[TripCount]; - for (unsigned I = CreatedBlockSize; I != CreatedBlocks.size(); ++I) + // Transfer cloned blocks to new VPlan. + for (unsigned I : seq(CreatedBlockSize, CreatedBlocks.size())) NewPlan->CreatedBlocks.push_back(CreatedBlocks[I]); CreatedBlocks.truncate(CreatedBlockSize); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index eb0e3baa8d4f3..434b4d7e49ab8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3714,8 +3714,7 @@ class VPRegionBlock : public VPBlockBase { : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr), IsReplicator(IsReplicator) {} - ~VPRegionBlock() override { - } + ~VPRegionBlock() override {} /// Method to support type inquiry through isa, cast, and dyn_cast. 
static inline bool classof(const VPBlockBase *V) { @@ -3841,6 +3840,8 @@ class VPlan { /// been modeled in VPlan directly. DenseMap SCEVToExpansion; + /// Blocks allocated and owned by the VPlan. They will be deleted once the + /// VPlan is destroyed. SmallVector CreatedBlocks; public: @@ -4060,6 +4061,9 @@ class VPlan { /// recipes to refer to the clones, and return it. VPlan *duplicate(); + /// Create a new VPBasicBlock with \p Name and containing \p Recipe if + /// present. The returned block is owned by the VPlan and deleted once the + /// VPlan is destroyed. VPBasicBlock *createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe = nullptr) { auto *VPB = new VPBasicBlock(Name, Recipe); @@ -4067,6 +4071,9 @@ class VPlan { return VPB; } + /// Create a new VPRegionBlock with \p Entry, \p Exiting and \p Name. If \p + /// IsReplicator is true, the region is a replicate region. The returned block + /// is owned by the VPlan and deleted once the VPlan is destroyed. VPRegionBlock *createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name = "", bool IsReplicator = false) { @@ -4075,6 +4082,10 @@ class VPlan { return VPB; } + /// Create a new VPRegionBlock with \p Name and entry and exiting blocks set + /// to nullptr. If \p IsReplicator is true, the region is a replicate region. + /// The returned block is owned by the VPlan and deleted once the VPlan is + /// destroyed. VPRegionBlock *createVPRegionBlock(const std::string &Name = "", bool IsReplicator = false) { auto *VPB = new VPRegionBlock(Name, IsReplicator); @@ -4084,6 +4095,8 @@ class VPlan { /// Create a VPIRBasicBlock from \p IRBB containing VPIRInstructions for all /// instructions in \p IRBB, except its terminator which is managed in VPlan. + /// The returned block is owned by the VPlan and deleted once the VPlan is + /// destroyed. 
VPIRBasicBlock *createVPIRBasicBlock(BasicBlock *IRBB); }; From 407dbc1eccf9e4cd22e1c20f9212fef0b2ead7f4 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 27 Dec 2024 11:25:59 +0000 Subject: [PATCH 13/25] [VPlan] Funnel --- llvm/lib/Transforms/Vectorize/VPlan.h | 14 +- .../Transforms/Vectorize/VPlanHCFGBuilder.h | 4 +- .../Transforms/Vectorize/VPDomTreeTest.cpp | 42 ++-- .../Transforms/Vectorize/VPlanHCFGTest.cpp | 2 +- .../Transforms/Vectorize/VPlanSlpTest.cpp | 2 +- .../Transforms/Vectorize/VPlanTest.cpp | 210 ++++++------------ .../Transforms/Vectorize/VPlanTestBase.h | 20 +- .../Vectorize/VPlanVerifierTest.cpp | 89 +++----- 8 files changed, 139 insertions(+), 244 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 434b4d7e49ab8..beabcc7fd4187 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3854,18 +3854,18 @@ class VPlan { "scalar header must be a leaf node"); } - /// Construct a VPlan with \p Entry entering the plan, trip count \p TC and - /// with \p ScalarHeader wrapping the original header of the scalar loop. - VPlan(VPBasicBlock *Entry, VPValue *TC, VPIRBasicBlock *ScalarHeader) - : VPlan(Entry, ScalarHeader) { - TripCount = TC; - } - +public: /// Construct a VPlan for \p L. This will create VPIRBasicBlocks wrapping the /// original preheader and scalar header of \p L, to be used as entry and /// scalar header blocks of the new VPlan. 
VPlan(Loop *L); + VPlan(BasicBlock *ScalarHeaderBB, VPValue *TC) { + setEntry(new VPBasicBlock("preheader")); + ScalarHeader = VPIRBasicBlock::fromBasicBlock(ScalarHeaderBB); + TripCount = TC; + } + ~VPlan(); void setEntry(VPBasicBlock *VPBB) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h index 9e8f9f3f40029..ad6e2ad90a961 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h @@ -32,11 +32,11 @@ class Loop; class LoopInfo; class VPRegionBlock; class VPlan; -class VPlanTestBase; +class VPlanTestIRBase; /// Main class to build the VPlan H-CFG for an incoming IR. class VPlanHCFGBuilder { - friend VPlanTestBase; + friend VPlanTestIRBase; private: // The outermost loop of the input loop nest considered for vectorization. diff --git a/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp b/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp index 847cca7714eff..6aa34a5fa431b 100644 --- a/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp @@ -9,12 +9,15 @@ #include "../lib/Transforms/Vectorize/VPlan.h" #include "../lib/Transforms/Vectorize/VPlanDominatorTree.h" +#include "VPlanTestBase.h" #include "gtest/gtest.h" namespace llvm { namespace { -TEST(VPDominatorTreeTest, DominanceNoRegionsTest) { +using VPDominatorTreeTest = VPlanTestBase; + +TEST_F(VPDominatorTreeTest, DominanceNoRegionsTest) { // VPBB0 // | // R1 { @@ -24,8 +27,8 @@ TEST(VPDominatorTreeTest, DominanceNoRegionsTest) { // \ / // VPBB4 // } - VPBasicBlock *VPPH = new VPBasicBlock("ph"); - VPBasicBlock *VPBB0 = new VPBasicBlock("VPBB0"); + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB0 = Plan.getEntry(); VPBasicBlock *VPBB1 = new VPBasicBlock("VPBB1"); VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2"); VPBasicBlock *VPBB3 = new VPBasicBlock("VPBB3"); @@ -40,12 +43,7 @@ TEST(VPDominatorTreeTest, DominanceNoRegionsTest) { 
VPBlockUtils::connectBlocks(VPBB2, VPBB4); VPBlockUtils::connectBlocks(VPBB3, VPBB4); - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB0); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader()); VPDominatorTree VPDT; VPDT.recalculate(Plan); @@ -62,7 +60,6 @@ TEST(VPDominatorTreeTest, DominanceNoRegionsTest) { EXPECT_EQ(VPDT.findNearestCommonDominator(VPBB2, VPBB3), VPBB1); EXPECT_EQ(VPDT.findNearestCommonDominator(VPBB2, VPBB4), VPBB1); EXPECT_EQ(VPDT.findNearestCommonDominator(VPBB4, VPBB4), VPBB4); - delete ScalarHeader; } static void @@ -76,9 +73,7 @@ checkDomChildren(VPDominatorTree &VPDT, VPBlockBase *Src, EXPECT_EQ(Children, ExpectedNodes); } -TEST(VPDominatorTreeTest, DominanceRegionsTest) { - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); +TEST_F(VPDominatorTreeTest, DominanceRegionsTest) { { // 2 consecutive regions. 
// VPBB0 @@ -99,8 +94,8 @@ TEST(VPDominatorTreeTest, DominanceRegionsTest) { // R2BB2 // } // - VPBasicBlock *VPPH = new VPBasicBlock("ph"); - VPBasicBlock *VPBB0 = new VPBasicBlock("VPBB0"); + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB0 = Plan.getEntry(); VPBasicBlock *R1BB1 = new VPBasicBlock(); VPBasicBlock *R1BB2 = new VPBasicBlock(); VPBasicBlock *R1BB3 = new VPBasicBlock(); @@ -122,10 +117,7 @@ TEST(VPDominatorTreeTest, DominanceRegionsTest) { VPBlockUtils::connectBlocks(R2BB1, R2BB2); VPBlockUtils::connectBlocks(R1, R2); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(R2, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB0); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R2, Plan.getScalarHeader()); VPDominatorTree VPDT; VPDT.recalculate(Plan); @@ -177,7 +169,7 @@ TEST(VPDominatorTreeTest, DominanceRegionsTest) { // | // VPBB2 // - VPBasicBlock *VPPH = new VPBasicBlock("ph"); + VPlan &Plan = getPlan(); VPBasicBlock *R1BB1 = new VPBasicBlock("R1BB1"); VPBasicBlock *R1BB2 = new VPBasicBlock("R1BB2"); VPBasicBlock *R1BB3 = new VPBasicBlock("R1BB3"); @@ -199,15 +191,12 @@ TEST(VPDominatorTreeTest, DominanceRegionsTest) { VPBlockUtils::connectBlocks(R1BB2, R1BB3); VPBlockUtils::connectBlocks(R2, R1BB3); - VPBasicBlock *VPBB1 = new VPBasicBlock("VPBB1"); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBlockUtils::connectBlocks(VPBB1, R1); VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2"); VPBlockUtils::connectBlocks(R1, VPBB2); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(VPBB2, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(VPBB2, Plan.getScalarHeader()); VPDominatorTree VPDT; VPDT.recalculate(Plan); @@ -220,9 +209,8 @@ TEST(VPDominatorTreeTest, DominanceRegionsTest) { checkDomChildren(VPDT, R2BB2, {R2BB3}); checkDomChildren(VPDT, R2BB3, {}); 
checkDomChildren(VPDT, R1BB3, {VPBB2}); - checkDomChildren(VPDT, VPBB2, {ScalarHeaderVPBB}); + checkDomChildren(VPDT, VPBB2, {Plan.getScalarHeader()}); } - delete ScalarHeader; } } // namespace diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp index 1b362d1d26bdd..19c2483d34ed1 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -17,7 +17,7 @@ namespace llvm { namespace { -class VPlanHCFGTest : public VPlanTestBase {}; +class VPlanHCFGTest : public VPlanTestIRBase {}; TEST_F(VPlanHCFGTest, testBuildHCFGInnerLoop) { const char *ModuleString = diff --git a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp index 1b993b63898ca..e3c542ec5cac8 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp @@ -16,7 +16,7 @@ namespace llvm { namespace { -class VPlanSlpTest : public VPlanTestBase { +class VPlanSlpTest : public VPlanTestIRBase { protected: TargetLibraryInfoImpl TLII; TargetLibraryInfo TLI; diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index f3a1bba518c83..2ab55f64a2073 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -9,6 +9,7 @@ #include "../lib/Transforms/Vectorize/VPlan.h" #include "../lib/Transforms/Vectorize/VPlanCFG.h" +#include "VPlanTestBase.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/Analysis/VectorUtils.h" @@ -237,12 +238,13 @@ TEST(VPInstructionTest, releaseOperandsAtDeletion) { delete VPV1; delete VPV2; } -TEST(VPBasicBlockTest, getPlan) { - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); + +using VPBasicBlockTest = VPlanTestBase; + +TEST_F(VPBasicBlockTest, getPlan) { { 
- VPBasicBlock *VPPH = new VPBasicBlock("ph"); - VPBasicBlock *VPBB1 = new VPBasicBlock(); + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBasicBlock *VPBB2 = new VPBasicBlock(); VPBasicBlock *VPBB3 = new VPBasicBlock(); VPBasicBlock *VPBB4 = new VPBasicBlock(); @@ -256,11 +258,7 @@ TEST(VPBasicBlockTest, getPlan) { VPBlockUtils::connectBlocks(VPBB1, VPBB3); VPBlockUtils::connectBlocks(VPBB2, VPBB4); VPBlockUtils::connectBlocks(VPBB3, VPBB4); - - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(VPBB4, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(VPBB4, Plan.getScalarHeader()); EXPECT_EQ(&Plan, VPBB1->getPlan()); EXPECT_EQ(&Plan, VPBB2->getPlan()); @@ -269,20 +267,17 @@ TEST(VPBasicBlockTest, getPlan) { } { - VPBasicBlock *VPPH = new VPBasicBlock("ph"); + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB1 = Plan.getEntry(); // VPBasicBlock is the entry into the VPlan, followed by a region. 
VPBasicBlock *R1BB1 = new VPBasicBlock(); VPBasicBlock *R1BB2 = new VPBasicBlock(); VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB2, "R1"); VPBlockUtils::connectBlocks(R1BB1, R1BB2); - VPBasicBlock *VPBB1 = new VPBasicBlock(); VPBlockUtils::connectBlocks(VPBB1, R1); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader()); EXPECT_EQ(&Plan, VPBB1->getPlan()); EXPECT_EQ(&Plan, R1->getPlan()); @@ -291,8 +286,7 @@ TEST(VPBasicBlockTest, getPlan) { } { - VPBasicBlock *VPPH = new VPBasicBlock("ph"); - + VPlan &Plan = getPlan(); VPBasicBlock *R1BB1 = new VPBasicBlock(); VPBasicBlock *R1BB2 = new VPBasicBlock(); VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB2, "R1"); @@ -303,7 +297,7 @@ TEST(VPBasicBlockTest, getPlan) { VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB2, "R2"); VPBlockUtils::connectBlocks(R2BB1, R2BB2); - VPBasicBlock *VPBB1 = new VPBasicBlock(); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBlockUtils::connectBlocks(VPBB1, R1); VPBlockUtils::connectBlocks(VPBB1, R2); @@ -311,10 +305,7 @@ TEST(VPBasicBlockTest, getPlan) { VPBlockUtils::connectBlocks(R1, VPBB2); VPBlockUtils::connectBlocks(R2, VPBB2); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(R2, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R2, Plan.getScalarHeader()); EXPECT_EQ(&Plan, VPBB1->getPlan()); EXPECT_EQ(&Plan, R1->getPlan()); @@ -325,12 +316,9 @@ TEST(VPBasicBlockTest, getPlan) { EXPECT_EQ(&Plan, R2BB2->getPlan()); EXPECT_EQ(&Plan, VPBB2->getPlan()); } - delete ScalarHeader; } -TEST(VPBasicBlockTest, TraversingIteratorTest) { - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); +TEST_F(VPBasicBlockTest, TraversingIteratorTest) { { // 
VPBasicBlocks only // VPBB1 @@ -339,8 +327,8 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { // \ / // VPBB4 // - VPBasicBlock *VPPH = new VPBasicBlock("ph"); - VPBasicBlock *VPBB1 = new VPBasicBlock(); + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBasicBlock *VPBB2 = new VPBasicBlock(); VPBasicBlock *VPBB3 = new VPBasicBlock(); VPBasicBlock *VPBB4 = new VPBasicBlock(); @@ -356,11 +344,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { EXPECT_EQ(VPBB1, FromIterator[0]); EXPECT_EQ(VPBB2, FromIterator[1]); - // Use Plan to properly clean up created blocks. - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(VPBB4, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(VPBB4, Plan.getScalarHeader()); } { @@ -382,8 +366,8 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { // | // R2BB2 // - VPBasicBlock *VPPH = new VPBasicBlock("ph"); - VPBasicBlock *VPBB0 = new VPBasicBlock("VPBB0"); + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB0 = Plan.getEntry(); VPBasicBlock *R1BB1 = new VPBasicBlock(); VPBasicBlock *R1BB2 = new VPBasicBlock(); VPBasicBlock *R1BB3 = new VPBasicBlock(); @@ -458,11 +442,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { EXPECT_EQ(R1BB1, FromIterator[6]); EXPECT_EQ(R1, FromIterator[7]); - // Use Plan to properly clean up created blocks. 
- VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(R2, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB0); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R2, Plan.getScalarHeader()); } { @@ -486,7 +466,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { // | // VPBB2 // - VPBasicBlock *VPPH = new VPBasicBlock("ph"); + VPlan &Plan = getPlan(); VPBasicBlock *R1BB1 = new VPBasicBlock("R1BB1"); VPBasicBlock *R1BB2 = new VPBasicBlock("R1BB2"); VPBasicBlock *R1BB3 = new VPBasicBlock("R1BB3"); @@ -508,7 +488,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { VPBlockUtils::connectBlocks(R1BB2, R1BB3); VPBlockUtils::connectBlocks(R2, R1BB3); - VPBasicBlock *VPBB1 = new VPBasicBlock("VPBB1"); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBlockUtils::connectBlocks(VPBB1, R1); VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2"); VPBlockUtils::connectBlocks(R1, VPBB2); @@ -543,11 +523,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { EXPECT_EQ(R1, FromIterator[8]); EXPECT_EQ(VPBB1, FromIterator[9]); - // Use Plan to properly clean up created blocks. 
- VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(VPBB2, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(VPBB2, Plan.getScalarHeader()); } { @@ -561,7 +537,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { // R2BB2 // } // - VPBasicBlock *VPPH = new VPBasicBlock("ph"); + VPlan &Plan = getPlan(); VPBasicBlock *R2BB1 = new VPBasicBlock("R2BB1"); VPBasicBlock *R2BB2 = new VPBasicBlock("R2BB2"); VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB2, "R2"); @@ -570,7 +546,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { VPRegionBlock *R1 = new VPRegionBlock(R2, R2, "R1"); R2->setParent(R1); - VPBasicBlock *VPBB1 = new VPBasicBlock("VPBB1"); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBlockUtils::connectBlocks(VPBB1, R1); // Depth-first. @@ -593,11 +569,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { EXPECT_EQ(R1, FromIterator[3]); EXPECT_EQ(VPBB1, FromIterator[4]); - // Use Plan to properly clean up created blocks. 
- VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader()); } { @@ -619,7 +591,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { // | // VPBB2 // - VPBasicBlock *VPPH = new VPBasicBlock("ph"); + VPlan &Plan = getPlan(); VPBasicBlock *R3BB1 = new VPBasicBlock("R3BB1"); VPRegionBlock *R3 = new VPRegionBlock(R3BB1, R3BB1, "R3"); @@ -631,7 +603,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { VPRegionBlock *R1 = new VPRegionBlock(R2, R2, "R1"); R2->setParent(R1); - VPBasicBlock *VPBB1 = new VPBasicBlock("VPBB1"); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2"); VPBlockUtils::connectBlocks(VPBB1, R1); VPBlockUtils::connectBlocks(R1, VPBB2); @@ -687,19 +659,15 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { EXPECT_EQ(R2BB1, FromIterator[2]); EXPECT_EQ(VPBB1, FromIterator[3]); - // Use Plan to properly clean up created blocks. 
- VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(VPBB2, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(VPBB2, Plan.getScalarHeader()); } - delete ScalarHeader; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -TEST(VPBasicBlockTest, print) { +TEST_F(VPBasicBlockTest, print) { VPInstruction *TC = new VPInstruction(Instruction::Add, {}); - VPBasicBlock *VPBB0 = new VPBasicBlock("preheader"); + VPlan &Plan = getPlan(TC); + VPBasicBlock *VPBB0 = Plan.getEntry(); VPBB0->appendRecipe(TC); VPInstruction *I1 = new VPInstruction(Instruction::Add, {}); @@ -730,12 +698,8 @@ TEST(VPBasicBlockTest, print) { EXPECT_EQ("EMIT br , ", I3Dump); } - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, "scalar.header"); - auto * ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(VPBB2, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(VPBB2, Plan.getScalarHeader()); VPBlockUtils::connectBlocks(VPBB0, VPBB1); - VPlan Plan(VPBB0, TC, ScalarHeaderVPBB); std::string FullDump; raw_string_ostream OS(FullDump); Plan.printDOT(OS); @@ -810,13 +774,12 @@ Successor(s): ir-bb OS << *I4; EXPECT_EQ("EMIT vp<%5> = mul vp<%3>, vp<%2>", I4Dump); } - delete ScalarHeader; } -TEST(VPBasicBlockTest, printPlanWithVFsAndUFs) { - +TEST_F(VPBasicBlockTest, printPlanWithVFsAndUFs) { VPInstruction *TC = new VPInstruction(Instruction::Sub, {}); - VPBasicBlock *VPBB0 = new VPBasicBlock("preheader"); + VPlan &Plan = getPlan(TC); + VPBasicBlock *VPBB0 = Plan.getEntry(); VPBB0->appendRecipe(TC); VPInstruction *I1 = new VPInstruction(Instruction::Add, {}); @@ -824,12 +787,8 @@ TEST(VPBasicBlockTest, printPlanWithVFsAndUFs) { VPBB1->appendRecipe(I1); VPBB1->setName("bb1"); - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(VPBB1, 
ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(VPBB1, Plan.getScalarHeader()); VPBlockUtils::connectBlocks(VPBB0, VPBB1); - VPlan Plan(VPBB0, TC, ScalarHeaderVPBB); Plan.setName("TestPlan"); Plan.addVF(ElementCount::getFixed(4)); @@ -847,9 +806,9 @@ Successor(s): bb1 bb1: EMIT vp<%2> = add -Successor(s): ir-bb<> +Successor(s): ir-bb -ir-bb<>: +ir-bb: No successors } )"; @@ -871,9 +830,9 @@ Successor(s): bb1 bb1: EMIT vp<%2> = add -Successor(s): ir-bb<> +Successor(s): ir-bb -ir-bb<>: +ir-bb: No successors } )"; @@ -895,19 +854,19 @@ Successor(s): bb1 bb1: EMIT vp<%2> = add -Successor(s): ir-bb<> +Successor(s): ir-bb -ir-bb<>: +ir-bb: No successors } )"; EXPECT_EQ(ExpectedStr, FullDump); } - delete ScalarHeader; } #endif -TEST(VPRecipeTest, CastVPInstructionToVPUser) { +using VPRecipeTest = VPlanTestBase; +TEST_F(VPRecipeTest, CastVPInstructionToVPUser) { VPValue Op1; VPValue Op2; VPInstruction Recipe(Instruction::Add, {&Op1, &Op2}); @@ -917,9 +876,7 @@ TEST(VPRecipeTest, CastVPInstructionToVPUser) { EXPECT_EQ(&Recipe, BaseR); } -TEST(VPRecipeTest, CastVPWidenRecipeToVPUser) { - LLVMContext C; - +TEST_F(VPRecipeTest, CastVPWidenRecipeToVPUser) { IntegerType *Int32 = IntegerType::get(C, 32); auto *AI = BinaryOperator::CreateAdd(PoisonValue::get(Int32), PoisonValue::get(Int32)); @@ -936,9 +893,7 @@ TEST(VPRecipeTest, CastVPWidenRecipeToVPUser) { delete AI; } -TEST(VPRecipeTest, CastVPWidenCallRecipeToVPUserAndVPDef) { - LLVMContext C; - +TEST_F(VPRecipeTest, CastVPWidenCallRecipeToVPUserAndVPDef) { IntegerType *Int32 = IntegerType::get(C, 32); FunctionType *FTy = FunctionType::get(Int32, false); Function *Fn = Function::Create(FTy, GlobalValue::ExternalLinkage, 0); @@ -964,9 +919,7 @@ TEST(VPRecipeTest, CastVPWidenCallRecipeToVPUserAndVPDef) { delete Fn; } -TEST(VPRecipeTest, CastVPWidenSelectRecipeToVPUserAndVPDef) { - LLVMContext C; - +TEST_F(VPRecipeTest, CastVPWidenSelectRecipeToVPUserAndVPDef) { IntegerType *Int1 = IntegerType::get(C, 1); IntegerType *Int32 = 
IntegerType::get(C, 32); auto *SelectI = SelectInst::Create( @@ -992,9 +945,7 @@ TEST(VPRecipeTest, CastVPWidenSelectRecipeToVPUserAndVPDef) { delete SelectI; } -TEST(VPRecipeTest, CastVPWidenGEPRecipeToVPUserAndVPDef) { - LLVMContext C; - +TEST_F(VPRecipeTest, CastVPWidenGEPRecipeToVPUserAndVPDef) { IntegerType *Int32 = IntegerType::get(C, 32); PointerType *Int32Ptr = PointerType::get(Int32, 0); auto *GEP = GetElementPtrInst::Create(Int32, PoisonValue::get(Int32Ptr), @@ -1017,9 +968,7 @@ TEST(VPRecipeTest, CastVPWidenGEPRecipeToVPUserAndVPDef) { delete GEP; } -TEST(VPRecipeTest, CastVPBlendRecipeToVPUser) { - LLVMContext C; - +TEST_F(VPRecipeTest, CastVPBlendRecipeToVPUser) { IntegerType *Int32 = IntegerType::get(C, 32); auto *Phi = PHINode::Create(Int32, 1); VPValue I1; @@ -1036,9 +985,7 @@ TEST(VPRecipeTest, CastVPBlendRecipeToVPUser) { delete Phi; } -TEST(VPRecipeTest, CastVPInterleaveRecipeToVPUser) { - LLVMContext C; - +TEST_F(VPRecipeTest, CastVPInterleaveRecipeToVPUser) { VPValue Addr; VPValue Mask; InterleaveGroup IG(4, false, Align(4)); @@ -1049,9 +996,7 @@ TEST(VPRecipeTest, CastVPInterleaveRecipeToVPUser) { EXPECT_EQ(&Recipe, BaseR); } -TEST(VPRecipeTest, CastVPReplicateRecipeToVPUser) { - LLVMContext C; - +TEST_F(VPRecipeTest, CastVPReplicateRecipeToVPUser) { VPValue Op1; VPValue Op2; SmallVector Args; @@ -1068,9 +1013,7 @@ TEST(VPRecipeTest, CastVPReplicateRecipeToVPUser) { delete Call; } -TEST(VPRecipeTest, CastVPBranchOnMaskRecipeToVPUser) { - LLVMContext C; - +TEST_F(VPRecipeTest, CastVPBranchOnMaskRecipeToVPUser) { VPValue Mask; VPBranchOnMaskRecipe Recipe(&Mask); EXPECT_TRUE(isa(&Recipe)); @@ -1079,9 +1022,7 @@ TEST(VPRecipeTest, CastVPBranchOnMaskRecipeToVPUser) { EXPECT_EQ(&Recipe, BaseR); } -TEST(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) { - LLVMContext C; - +TEST_F(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) { IntegerType *Int32 = IntegerType::get(C, 32); PointerType *Int32Ptr = PointerType::get(Int32, 0); auto *Load = 
@@ -1101,8 +1042,7 @@ TEST(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) { delete Load; } -TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { - LLVMContext C; +TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { IntegerType *Int1 = IntegerType::get(C, 1); IntegerType *Int32 = IntegerType::get(C, 32); PointerType *Int32Ptr = PointerType::get(Int32, 0); @@ -1242,7 +1182,6 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { { // Test for a call to a function without side-effects. - LLVMContext C; Module M("", C); Function *TheFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::thread_pointer); @@ -1296,15 +1235,12 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -TEST(VPRecipeTest, dumpRecipeInPlan) { - VPBasicBlock *VPBB0 = new VPBasicBlock("preheader"); +TEST_F(VPRecipeTest, dumpRecipeInPlan) { + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB0 = Plan.getEntry(); VPBasicBlock *VPBB1 = new VPBasicBlock(); - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(VPBB1, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(VPBB1, Plan.getScalarHeader()); VPBlockUtils::connectBlocks(VPBB0, VPBB1); - VPlan Plan(VPBB0, ScalarHeaderVPBB); IntegerType *Int32 = IntegerType::get(C, 32); auto *AI = BinaryOperator::CreateAdd(PoisonValue::get(Int32), @@ -1366,18 +1302,14 @@ TEST(VPRecipeTest, dumpRecipeInPlan) { } delete AI; - delete ScalarHeader; } -TEST(VPRecipeTest, dumpRecipeUnnamedVPValuesInPlan) { - VPBasicBlock *VPBB0 = new VPBasicBlock("preheader"); +TEST_F(VPRecipeTest, dumpRecipeUnnamedVPValuesInPlan) { + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB0 = Plan.getEntry(); VPBasicBlock *VPBB1 = new VPBasicBlock(); - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - 
VPBlockUtils::connectBlocks(VPBB1, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(VPBB1, Plan.getScalarHeader()); VPBlockUtils::connectBlocks(VPBB0, VPBB1); - VPlan Plan(VPBB0, ScalarHeaderVPBB); IntegerType *Int32 = IntegerType::get(C, 32); auto *AI = BinaryOperator::CreateAdd(PoisonValue::get(Int32), @@ -1456,11 +1388,9 @@ TEST(VPRecipeTest, dumpRecipeUnnamedVPValuesInPlan) { testing::ExitedWithCode(0), "EMIT vp<%2> = mul vp<%1>, vp<%1>"); } delete AI; - delete ScalarHeader; } -TEST(VPRecipeTest, dumpRecipeUnnamedVPValuesNotInPlanOrBlock) { - LLVMContext C; +TEST_F(VPRecipeTest, dumpRecipeUnnamedVPValuesNotInPlanOrBlock) { IntegerType *Int32 = IntegerType::get(C, 32); auto *AI = BinaryOperator::CreateAdd(PoisonValue::get(Int32), PoisonValue::get(Int32)); @@ -1543,9 +1473,7 @@ TEST(VPRecipeTest, dumpRecipeUnnamedVPValuesNotInPlanOrBlock) { #endif -TEST(VPRecipeTest, CastVPReductionRecipeToVPUser) { - LLVMContext C; - +TEST_F(VPRecipeTest, CastVPReductionRecipeToVPUser) { VPValue ChainOp; VPValue VecOp; VPValue CondOp; @@ -1556,9 +1484,7 @@ TEST(VPRecipeTest, CastVPReductionRecipeToVPUser) { EXPECT_TRUE(isa(BaseR)); } -TEST(VPRecipeTest, CastVPReductionEVLRecipeToVPUser) { - LLVMContext C; - +TEST_F(VPRecipeTest, CastVPReductionEVLRecipeToVPUser) { VPValue ChainOp; VPValue VecOp; VPValue CondOp; @@ -1630,7 +1556,7 @@ TEST(VPDoubleValueDefTest, traverseUseLists) { EXPECT_EQ(&DoubleValueDef, I3.getOperand(0)->getDefiningRecipe()); } -TEST(VPRecipeTest, CastToVPSingleDefRecipe) { +TEST_F(VPRecipeTest, CastToVPSingleDefRecipe) { VPValue Start; VPEVLBasedIVPHIRecipe R(&Start, {}); VPRecipeBase *B = &R; diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h index 06e091da9054e..1836a5e39a290 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h +++ b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h @@ -28,7 +28,7 @@ namespace llvm { /// Helper class to create a module from an assembly string 
and VPlans for a /// given loop entry block. -class VPlanTestBase : public testing::Test { +class VPlanTestIRBase : public testing::Test { protected: TargetLibraryInfoImpl TLII; TargetLibraryInfo TLI; @@ -41,7 +41,7 @@ class VPlanTestBase : public testing::Test { std::unique_ptr AC; std::unique_ptr SE; - VPlanTestBase() + VPlanTestIRBase() : TLII(), TLI(TLII), DL("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-" "f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:" @@ -92,6 +92,22 @@ class VPlanTestBase : public testing::Test { } }; +class VPlanTestBase : public testing::Test { +protected: + LLVMContext C; + std::unique_ptr ScalarHeader; + SmallVector> Plans; + + VPlanTestBase() : ScalarHeader(BasicBlock::Create(C, "scalar.header")) { + BranchInst::Create(&*ScalarHeader, &*ScalarHeader); + } + + VPlan &getPlan(VPValue *TC = nullptr) { + Plans.push_back(std::make_unique(&*ScalarHeader, TC)); + return *Plans.back(); + } +}; + } // namespace llvm #endif // LLVM_UNITTESTS_TRANSFORMS_VECTORIZE_VPLANTESTBASE_H diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp index 6448153de7821..174249a7e85e3 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp @@ -8,32 +8,29 @@ #include "../lib/Transforms/Vectorize/VPlanVerifier.h" #include "../lib/Transforms/Vectorize/VPlan.h" +#include "VPlanTestBase.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "gtest/gtest.h" using namespace llvm; +using VPVerifierTest = VPlanTestBase; + namespace { -TEST(VPVerifierTest, VPInstructionUseBeforeDefSameBB) { +TEST_F(VPVerifierTest, VPInstructionUseBeforeDefSameBB) { + VPlan &Plan = getPlan(); VPInstruction *DefI = new VPInstruction(Instruction::Add, {}); VPInstruction *UseI = new VPInstruction(Instruction::Sub, {DefI}); - VPBasicBlock *VPPH = new VPBasicBlock("ph"); - 
VPBasicBlock *VPBB1 = new VPBasicBlock(); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBB1->appendRecipe(UseI); VPBB1->appendRecipe(DefI); VPBasicBlock *VPBB2 = new VPBasicBlock(); VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1"); VPBlockUtils::connectBlocks(VPBB1, R1); - - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader()); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr(); @@ -43,18 +40,17 @@ TEST(VPVerifierTest, VPInstructionUseBeforeDefSameBB) { EXPECT_STREQ("Use before def!\n", ::testing::internal::GetCapturedStderr().c_str()); #endif - delete ScalarHeader; } -TEST(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) { +TEST_F(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) { + VPlan &Plan = getPlan(); VPInstruction *DefI = new VPInstruction(Instruction::Add, {}); VPInstruction *UseI = new VPInstruction(Instruction::Sub, {DefI}); auto *CanIV = new VPCanonicalIVPHIRecipe(UseI, {}); VPInstruction *BranchOnCond = new VPInstruction(VPInstruction::BranchOnCond, {CanIV}); - VPBasicBlock *VPPH = new VPBasicBlock("ph"); - VPBasicBlock *VPBB1 = new VPBasicBlock(); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBasicBlock *VPBB2 = new VPBasicBlock(); VPBB1->appendRecipe(UseI); @@ -64,13 +60,7 @@ TEST(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) { VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1"); VPBlockUtils::connectBlocks(VPBB1, R1); - - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R1, 
Plan.getScalarHeader()); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr(); @@ -80,11 +70,9 @@ TEST(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) { EXPECT_STREQ("Use before def!\n", ::testing::internal::GetCapturedStderr().c_str()); #endif - delete ScalarHeader; } -TEST(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) { - LLVMContext C; +TEST_F(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) { IntegerType *Int32 = IntegerType::get(C, 32); auto *Phi = PHINode::Create(Int32, 1); @@ -95,8 +83,8 @@ TEST(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) { new VPInstruction(VPInstruction::BranchOnCond, {CanIV}); auto *Blend = new VPBlendRecipe(Phi, {DefI}); - VPBasicBlock *VPPH = new VPBasicBlock("ph"); - VPBasicBlock *VPBB1 = new VPBasicBlock(); + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBasicBlock *VPBB2 = new VPBasicBlock(); VPBasicBlock *VPBB3 = new VPBasicBlock(); VPBasicBlock *VPBB4 = new VPBasicBlock(); @@ -113,11 +101,7 @@ TEST(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) { VPBlockUtils::connectBlocks(VPBB1, R1); VPBB3->setParent(R1); - auto *ScalarHeader = BasicBlock::Create(C, ""); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader()); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr(); @@ -129,10 +113,9 @@ TEST(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) { #endif delete Phi; - delete ScalarHeader; } -TEST(VPVerifierTest, DuplicateSuccessorsOutsideRegion) { +TEST_F(VPVerifierTest, DuplicateSuccessorsOutsideRegion) { VPInstruction *I1 = new VPInstruction(Instruction::Add, {}); auto *CanIV = new VPCanonicalIVPHIRecipe(I1, {}); VPInstruction *BranchOnCond = @@ -140,8 +123,8 @@ TEST(VPVerifierTest, DuplicateSuccessorsOutsideRegion) { VPInstruction *BranchOnCond2 = new 
VPInstruction(VPInstruction::BranchOnCond, {I1}); - VPBasicBlock *VPPH = new VPBasicBlock("ph"); - VPBasicBlock *VPBB1 = new VPBasicBlock(); + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBasicBlock *VPBB2 = new VPBasicBlock(); VPBB1->appendRecipe(I1); @@ -153,12 +136,7 @@ TEST(VPVerifierTest, DuplicateSuccessorsOutsideRegion) { VPBlockUtils::connectBlocks(VPBB1, R1); VPBlockUtils::connectBlocks(VPBB1, R1); - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader()); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr(); @@ -168,10 +146,9 @@ TEST(VPVerifierTest, DuplicateSuccessorsOutsideRegion) { EXPECT_STREQ("Multiple instances of the same successor.\n", ::testing::internal::GetCapturedStderr().c_str()); #endif - delete ScalarHeader; } -TEST(VPVerifierTest, DuplicateSuccessorsInsideRegion) { +TEST_F(VPVerifierTest, DuplicateSuccessorsInsideRegion) { VPInstruction *I1 = new VPInstruction(Instruction::Add, {}); auto *CanIV = new VPCanonicalIVPHIRecipe(I1, {}); VPInstruction *BranchOnCond = @@ -179,8 +156,8 @@ TEST(VPVerifierTest, DuplicateSuccessorsInsideRegion) { VPInstruction *BranchOnCond2 = new VPInstruction(VPInstruction::BranchOnCond, {I1}); - VPBasicBlock *VPPH = new VPBasicBlock("ph"); - VPBasicBlock *VPBB1 = new VPBasicBlock(); + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBasicBlock *VPBB2 = new VPBasicBlock(); VPBasicBlock *VPBB3 = new VPBasicBlock(); @@ -195,12 +172,7 @@ TEST(VPVerifierTest, DuplicateSuccessorsInsideRegion) { VPBlockUtils::connectBlocks(VPBB1, R1); VPBB3->setParent(R1); - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - 
VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader()); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr(); @@ -210,12 +182,11 @@ TEST(VPVerifierTest, DuplicateSuccessorsInsideRegion) { EXPECT_STREQ("Multiple instances of the same successor.\n", ::testing::internal::GetCapturedStderr().c_str()); #endif - delete ScalarHeader; } -TEST(VPVerifierTest, BlockOutsideRegionWithParent) { - VPBasicBlock *VPPH = new VPBasicBlock("ph"); - VPBasicBlock *VPBB1 = new VPBasicBlock(); +TEST_F(VPVerifierTest, BlockOutsideRegionWithParent) { + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB1 = Plan.getEntry(); VPBasicBlock *VPBB2 = new VPBasicBlock(); VPInstruction *DefI = new VPInstruction(Instruction::Add, {}); @@ -228,12 +199,7 @@ TEST(VPVerifierTest, BlockOutsideRegionWithParent) { VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1"); VPBlockUtils::connectBlocks(VPBB1, R1); - LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); - VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); - VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); - VPBlockUtils::connectBlocks(VPPH, VPBB1); - VPlan Plan(VPPH, ScalarHeaderVPBB); + VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader()); VPBB1->setParent(R1); #if GTEST_HAS_STREAM_REDIRECTION @@ -244,7 +210,6 @@ TEST(VPVerifierTest, BlockOutsideRegionWithParent) { EXPECT_STREQ("Predecessor is not in the same region.\n", ::testing::internal::GetCapturedStderr().c_str()); #endif - delete ScalarHeader; } } // namespace From af48fccd3d35851f25890f7844aebf2c57a566cc Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 27 Dec 2024 11:25:59 +0000 Subject: [PATCH 14/25] [VPlan] Funnel --- llvm/lib/Transforms/Vectorize/VPlan.h | 5 +- .../Transforms/Vectorize/VPlanHCFGBuilder.cpp | 6 +- .../Transforms/Vectorize/VPDomTreeTest.cpp | 35 +++++---- 
.../Transforms/Vectorize/VPlanTest.cpp | 77 ++++++++++--------- .../Vectorize/VPlanVerifierTest.cpp | 18 ++--- 5 files changed, 72 insertions(+), 69 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index beabcc7fd4187..f235ed37e2c9b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3844,7 +3844,6 @@ class VPlan { /// VPlan is destroyed. SmallVector CreatedBlocks; -public: /// Construct a VPlan with \p Entry to the plan and with \p ScalarHeader /// wrapping the original header of the scalar loop. VPlan(VPBasicBlock *Entry, VPIRBasicBlock *ScalarHeader) @@ -3861,8 +3860,8 @@ class VPlan { VPlan(Loop *L); VPlan(BasicBlock *ScalarHeaderBB, VPValue *TC) { - setEntry(new VPBasicBlock("preheader")); - ScalarHeader = VPIRBasicBlock::fromBasicBlock(ScalarHeaderBB); + setEntry(createVPBasicBlock("preheader")); + ScalarHeader = createVPIRBasicBlock(ScalarHeaderBB); TripCount = TC; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp index 02f4c8d8872d8..76ed578424dfe 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp @@ -357,12 +357,10 @@ void PlainCFGBuilder::buildPlainCFG() { BB2VPBB[TheLoop->getHeader()] = VectorHeaderVPBB; VectorHeaderVPBB->clearSuccessors(); VectorLatchVPBB->clearPredecessors(); - if (TheLoop->getHeader() != TheLoop->getLoopLatch()) { + if (TheLoop->getHeader() != TheLoop->getLoopLatch()) BB2VPBB[TheLoop->getLoopLatch()] = VectorLatchVPBB; - } else { + else TheRegion->setExiting(VectorHeaderVPBB); - delete VectorLatchVPBB; - } // 1. Scan the body of the loop in a topological order to visit each basic // block after having visited its predecessor basic blocks. 
Create a VPBB for diff --git a/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp b/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp index 6aa34a5fa431b..4e1415fa7ac13 100644 --- a/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp @@ -29,10 +29,10 @@ TEST_F(VPDominatorTreeTest, DominanceNoRegionsTest) { // } VPlan &Plan = getPlan(); VPBasicBlock *VPBB0 = Plan.getEntry(); - VPBasicBlock *VPBB1 = new VPBasicBlock("VPBB1"); - VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2"); - VPBasicBlock *VPBB3 = new VPBasicBlock("VPBB3"); - VPBasicBlock *VPBB4 = new VPBasicBlock("VPBB4"); + VPBasicBlock *VPBB1 = Plan.createVPBasicBlock("VPBB1"); + VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("VPBB2"); + VPBasicBlock *VPBB3 = Plan.createVPBasicBlock("VPBB3"); + VPBasicBlock *VPBB4 = Plan.createVPBasicBlock("VPBB4"); VPRegionBlock *R1 = new VPRegionBlock(VPBB1, VPBB4); VPBB2->setParent(R1); VPBB3->setParent(R1); @@ -96,10 +96,10 @@ TEST_F(VPDominatorTreeTest, DominanceRegionsTest) { // VPlan &Plan = getPlan(); VPBasicBlock *VPBB0 = Plan.getEntry(); - VPBasicBlock *R1BB1 = new VPBasicBlock(); - VPBasicBlock *R1BB2 = new VPBasicBlock(); - VPBasicBlock *R1BB3 = new VPBasicBlock(); - VPBasicBlock *R1BB4 = new VPBasicBlock(); + VPBasicBlock *R1BB1 = Plan.createVPBasicBlock(""); + VPBasicBlock *R1BB2 = Plan.createVPBasicBlock(""); + VPBasicBlock *R1BB3 = Plan.createVPBasicBlock(""); + VPBasicBlock *R1BB4 = Plan.createVPBasicBlock(""); VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB4, "R1"); R1BB2->setParent(R1); R1BB3->setParent(R1); @@ -111,8 +111,8 @@ TEST_F(VPDominatorTreeTest, DominanceRegionsTest) { // Cycle. 
VPBlockUtils::connectBlocks(R1BB3, R1BB3); - VPBasicBlock *R2BB1 = new VPBasicBlock(); - VPBasicBlock *R2BB2 = new VPBasicBlock(); + VPBasicBlock *R2BB1 = Plan.createVPBasicBlock(""); + VPBasicBlock *R2BB2 = Plan.createVPBasicBlock(""); VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB2, "R2"); VPBlockUtils::connectBlocks(R2BB1, R2BB2); VPBlockUtils::connectBlocks(R1, R2); @@ -170,14 +170,14 @@ TEST_F(VPDominatorTreeTest, DominanceRegionsTest) { // VPBB2 // VPlan &Plan = getPlan(); - VPBasicBlock *R1BB1 = new VPBasicBlock("R1BB1"); - VPBasicBlock *R1BB2 = new VPBasicBlock("R1BB2"); - VPBasicBlock *R1BB3 = new VPBasicBlock("R1BB3"); + VPBasicBlock *R1BB1 = Plan.createVPBasicBlock("R1BB1"); + VPBasicBlock *R1BB2 = Plan.createVPBasicBlock("R1BB2"); + VPBasicBlock *R1BB3 = Plan.createVPBasicBlock("R1BB3"); VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB3, "R1"); - VPBasicBlock *R2BB1 = new VPBasicBlock("R2BB1"); - VPBasicBlock *R2BB2 = new VPBasicBlock("R2BB2"); - VPBasicBlock *R2BB3 = new VPBasicBlock("R2BB3"); + VPBasicBlock *R2BB1 = Plan.createVPBasicBlock(""); + VPBasicBlock *R2BB2 = Plan.createVPBasicBlock(""); + VPBasicBlock *R2BB3 = Plan.createVPBasicBlock(""); VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB3, "R2"); R2BB2->setParent(R2); VPBlockUtils::connectBlocks(R2BB1, R2BB2); @@ -193,7 +193,8 @@ TEST_F(VPDominatorTreeTest, DominanceRegionsTest) { VPBasicBlock *VPBB1 = Plan.getEntry(); VPBlockUtils::connectBlocks(VPBB1, R1); - VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2"); + VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("" + "VPBB2"); VPBlockUtils::connectBlocks(R1, VPBB2); VPBlockUtils::connectBlocks(VPBB2, Plan.getScalarHeader()); diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 2ab55f64a2073..6928303499131 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -245,9 +245,9 @@ TEST_F(VPBasicBlockTest, getPlan) { 
{ VPlan &Plan = getPlan(); VPBasicBlock *VPBB1 = Plan.getEntry(); - VPBasicBlock *VPBB2 = new VPBasicBlock(); - VPBasicBlock *VPBB3 = new VPBasicBlock(); - VPBasicBlock *VPBB4 = new VPBasicBlock(); + VPBasicBlock *VPBB2 = Plan.createVPBasicBlock(""); + VPBasicBlock *VPBB3 = Plan.createVPBasicBlock(""); + VPBasicBlock *VPBB4 = Plan.createVPBasicBlock(""); // VPBB1 // / \ @@ -270,8 +270,8 @@ TEST_F(VPBasicBlockTest, getPlan) { VPlan &Plan = getPlan(); VPBasicBlock *VPBB1 = Plan.getEntry(); // VPBasicBlock is the entry into the VPlan, followed by a region. - VPBasicBlock *R1BB1 = new VPBasicBlock(); - VPBasicBlock *R1BB2 = new VPBasicBlock(); + VPBasicBlock *R1BB1 = Plan.createVPBasicBlock(""); + VPBasicBlock *R1BB2 = Plan.createVPBasicBlock(""); VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB2, "R1"); VPBlockUtils::connectBlocks(R1BB1, R1BB2); @@ -287,13 +287,13 @@ TEST_F(VPBasicBlockTest, getPlan) { { VPlan &Plan = getPlan(); - VPBasicBlock *R1BB1 = new VPBasicBlock(); - VPBasicBlock *R1BB2 = new VPBasicBlock(); + VPBasicBlock *R1BB1 = Plan.createVPBasicBlock(""); + VPBasicBlock *R1BB2 = Plan.createVPBasicBlock(""); VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB2, "R1"); VPBlockUtils::connectBlocks(R1BB1, R1BB2); - VPBasicBlock *R2BB1 = new VPBasicBlock(); - VPBasicBlock *R2BB2 = new VPBasicBlock(); + VPBasicBlock *R2BB1 = Plan.createVPBasicBlock(""); + VPBasicBlock *R2BB2 = Plan.createVPBasicBlock(""); VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB2, "R2"); VPBlockUtils::connectBlocks(R2BB1, R2BB2); @@ -301,7 +301,7 @@ TEST_F(VPBasicBlockTest, getPlan) { VPBlockUtils::connectBlocks(VPBB1, R1); VPBlockUtils::connectBlocks(VPBB1, R2); - VPBasicBlock *VPBB2 = new VPBasicBlock(); + VPBasicBlock *VPBB2 = Plan.createVPBasicBlock(""); VPBlockUtils::connectBlocks(R1, VPBB2); VPBlockUtils::connectBlocks(R2, VPBB2); @@ -329,9 +329,9 @@ TEST_F(VPBasicBlockTest, TraversingIteratorTest) { // VPlan &Plan = getPlan(); VPBasicBlock *VPBB1 = Plan.getEntry(); - VPBasicBlock 
*VPBB2 = new VPBasicBlock(); - VPBasicBlock *VPBB3 = new VPBasicBlock(); - VPBasicBlock *VPBB4 = new VPBasicBlock(); + VPBasicBlock *VPBB2 = Plan.createVPBasicBlock(""); + VPBasicBlock *VPBB3 = Plan.createVPBasicBlock(""); + VPBasicBlock *VPBB4 = Plan.createVPBasicBlock(""); VPBlockUtils::connectBlocks(VPBB1, VPBB2); VPBlockUtils::connectBlocks(VPBB1, VPBB3); @@ -368,10 +368,10 @@ TEST_F(VPBasicBlockTest, TraversingIteratorTest) { // VPlan &Plan = getPlan(); VPBasicBlock *VPBB0 = Plan.getEntry(); - VPBasicBlock *R1BB1 = new VPBasicBlock(); - VPBasicBlock *R1BB2 = new VPBasicBlock(); - VPBasicBlock *R1BB3 = new VPBasicBlock(); - VPBasicBlock *R1BB4 = new VPBasicBlock(); + VPBasicBlock *R1BB1 = Plan.createVPBasicBlock(""); + VPBasicBlock *R1BB2 = Plan.createVPBasicBlock(""); + VPBasicBlock *R1BB3 = Plan.createVPBasicBlock(""); + VPBasicBlock *R1BB4 = Plan.createVPBasicBlock(""); VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB4, "R1"); R1BB2->setParent(R1); R1BB3->setParent(R1); @@ -383,8 +383,8 @@ TEST_F(VPBasicBlockTest, TraversingIteratorTest) { // Cycle. 
VPBlockUtils::connectBlocks(R1BB3, R1BB3); - VPBasicBlock *R2BB1 = new VPBasicBlock(); - VPBasicBlock *R2BB2 = new VPBasicBlock(); + VPBasicBlock *R2BB1 = Plan.createVPBasicBlock(""); + VPBasicBlock *R2BB2 = Plan.createVPBasicBlock(""); VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB2, "R2"); VPBlockUtils::connectBlocks(R2BB1, R2BB2); VPBlockUtils::connectBlocks(R1, R2); @@ -467,14 +467,17 @@ TEST_F(VPBasicBlockTest, TraversingIteratorTest) { // VPBB2 // VPlan &Plan = getPlan(); - VPBasicBlock *R1BB1 = new VPBasicBlock("R1BB1"); - VPBasicBlock *R1BB2 = new VPBasicBlock("R1BB2"); - VPBasicBlock *R1BB3 = new VPBasicBlock("R1BB3"); + VPBasicBlock *R1BB1 = Plan.createVPBasicBlock("R1BB1"); + VPBasicBlock *R1BB2 = Plan.createVPBasicBlock("R1BB2"); + VPBasicBlock *R1BB3 = Plan.createVPBasicBlock("R1BB3"); VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB3, "R1"); - VPBasicBlock *R2BB1 = new VPBasicBlock("R2BB1"); - VPBasicBlock *R2BB2 = new VPBasicBlock("R2BB2"); - VPBasicBlock *R2BB3 = new VPBasicBlock("R2BB3"); + VPBasicBlock *R2BB1 = Plan.createVPBasicBlock("" + "R2BB1"); + VPBasicBlock *R2BB2 = Plan.createVPBasicBlock("" + "R2BB2"); + VPBasicBlock *R2BB3 = Plan.createVPBasicBlock("" + "R2BB3"); VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB3, "R2"); R2BB2->setParent(R2); VPBlockUtils::connectBlocks(R2BB1, R2BB2); @@ -490,7 +493,8 @@ TEST_F(VPBasicBlockTest, TraversingIteratorTest) { VPBasicBlock *VPBB1 = Plan.getEntry(); VPBlockUtils::connectBlocks(VPBB1, R1); - VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2"); + VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("" + "VPBB2"); VPBlockUtils::connectBlocks(R1, VPBB2); // Depth-first. 
@@ -538,8 +542,8 @@ TEST_F(VPBasicBlockTest, TraversingIteratorTest) { // } // VPlan &Plan = getPlan(); - VPBasicBlock *R2BB1 = new VPBasicBlock("R2BB1"); - VPBasicBlock *R2BB2 = new VPBasicBlock("R2BB2"); + VPBasicBlock *R2BB1 = Plan.createVPBasicBlock("R2BB1"); + VPBasicBlock *R2BB2 = Plan.createVPBasicBlock("R2BB2"); VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB2, "R2"); VPBlockUtils::connectBlocks(R2BB1, R2BB2); @@ -592,10 +596,11 @@ TEST_F(VPBasicBlockTest, TraversingIteratorTest) { // VPBB2 // VPlan &Plan = getPlan(); - VPBasicBlock *R3BB1 = new VPBasicBlock("R3BB1"); + VPBasicBlock *R3BB1 = Plan.createVPBasicBlock("R3BB1"); VPRegionBlock *R3 = new VPRegionBlock(R3BB1, R3BB1, "R3"); - VPBasicBlock *R2BB1 = new VPBasicBlock("R2BB1"); + VPBasicBlock *R2BB1 = Plan.createVPBasicBlock("" + "R2BB1"); VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R3, "R2"); R3->setParent(R2); VPBlockUtils::connectBlocks(R2BB1, R3); @@ -604,7 +609,7 @@ TEST_F(VPBasicBlockTest, TraversingIteratorTest) { R2->setParent(R1); VPBasicBlock *VPBB1 = Plan.getEntry(); - VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2"); + VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("VPBB2"); VPBlockUtils::connectBlocks(VPBB1, R1); VPBlockUtils::connectBlocks(R1, VPBB2); @@ -674,7 +679,7 @@ TEST_F(VPBasicBlockTest, print) { VPInstruction *I2 = new VPInstruction(Instruction::Sub, {I1}); VPInstruction *I3 = new VPInstruction(Instruction::Br, {I1, I2}); - VPBasicBlock *VPBB1 = new VPBasicBlock(); + VPBasicBlock *VPBB1 = Plan.createVPBasicBlock(""); VPBB1->appendRecipe(I1); VPBB1->appendRecipe(I2); VPBB1->appendRecipe(I3); @@ -682,7 +687,7 @@ TEST_F(VPBasicBlockTest, print) { VPInstruction *I4 = new VPInstruction(Instruction::Mul, {I2, I1}); VPInstruction *I5 = new VPInstruction(Instruction::Ret, {I4}); - VPBasicBlock *VPBB2 = new VPBasicBlock(); + VPBasicBlock *VPBB2 = Plan.createVPBasicBlock(""); VPBB2->appendRecipe(I4); VPBB2->appendRecipe(I5); VPBB2->setName("bb2"); @@ -783,7 +788,7 @@ 
TEST_F(VPBasicBlockTest, printPlanWithVFsAndUFs) { VPBB0->appendRecipe(TC); VPInstruction *I1 = new VPInstruction(Instruction::Add, {}); - VPBasicBlock *VPBB1 = new VPBasicBlock(); + VPBasicBlock *VPBB1 = Plan.createVPBasicBlock(""); VPBB1->appendRecipe(I1); VPBB1->setName("bb1"); @@ -1238,7 +1243,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { TEST_F(VPRecipeTest, dumpRecipeInPlan) { VPlan &Plan = getPlan(); VPBasicBlock *VPBB0 = Plan.getEntry(); - VPBasicBlock *VPBB1 = new VPBasicBlock(); + VPBasicBlock *VPBB1 = Plan.createVPBasicBlock(""); VPBlockUtils::connectBlocks(VPBB1, Plan.getScalarHeader()); VPBlockUtils::connectBlocks(VPBB0, VPBB1); @@ -1307,7 +1312,7 @@ TEST_F(VPRecipeTest, dumpRecipeInPlan) { TEST_F(VPRecipeTest, dumpRecipeUnnamedVPValuesInPlan) { VPlan &Plan = getPlan(); VPBasicBlock *VPBB0 = Plan.getEntry(); - VPBasicBlock *VPBB1 = new VPBasicBlock(); + VPBasicBlock *VPBB1 = Plan.createVPBasicBlock(""); VPBlockUtils::connectBlocks(VPBB1, Plan.getScalarHeader()); VPBlockUtils::connectBlocks(VPBB0, VPBB1); diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp index 174249a7e85e3..5a29e7ac0893b 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp @@ -27,7 +27,7 @@ TEST_F(VPVerifierTest, VPInstructionUseBeforeDefSameBB) { VPBB1->appendRecipe(UseI); VPBB1->appendRecipe(DefI); - VPBasicBlock *VPBB2 = new VPBasicBlock(); + VPBasicBlock *VPBB2 = Plan.createVPBasicBlock(""); VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1"); VPBlockUtils::connectBlocks(VPBB1, R1); VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader()); @@ -51,7 +51,7 @@ TEST_F(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) { new VPInstruction(VPInstruction::BranchOnCond, {CanIV}); VPBasicBlock *VPBB1 = Plan.getEntry(); - VPBasicBlock *VPBB2 = new VPBasicBlock(); + VPBasicBlock *VPBB2 = 
Plan.createVPBasicBlock(""); VPBB1->appendRecipe(UseI); VPBB2->appendRecipe(CanIV); @@ -85,9 +85,9 @@ TEST_F(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) { VPlan &Plan = getPlan(); VPBasicBlock *VPBB1 = Plan.getEntry(); - VPBasicBlock *VPBB2 = new VPBasicBlock(); - VPBasicBlock *VPBB3 = new VPBasicBlock(); - VPBasicBlock *VPBB4 = new VPBasicBlock(); + VPBasicBlock *VPBB2 = Plan.createVPBasicBlock(""); + VPBasicBlock *VPBB3 = Plan.createVPBasicBlock(""); + VPBasicBlock *VPBB4 = Plan.createVPBasicBlock(""); VPBB1->appendRecipe(I1); VPBB2->appendRecipe(CanIV); @@ -125,7 +125,7 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsOutsideRegion) { VPlan &Plan = getPlan(); VPBasicBlock *VPBB1 = Plan.getEntry(); - VPBasicBlock *VPBB2 = new VPBasicBlock(); + VPBasicBlock *VPBB2 = Plan.createVPBasicBlock(""); VPBB1->appendRecipe(I1); VPBB1->appendRecipe(BranchOnCond2); @@ -158,8 +158,8 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsInsideRegion) { VPlan &Plan = getPlan(); VPBasicBlock *VPBB1 = Plan.getEntry(); - VPBasicBlock *VPBB2 = new VPBasicBlock(); - VPBasicBlock *VPBB3 = new VPBasicBlock(); + VPBasicBlock *VPBB2 = Plan.createVPBasicBlock(""); + VPBasicBlock *VPBB3 = Plan.createVPBasicBlock(""); VPBB1->appendRecipe(I1); VPBB2->appendRecipe(CanIV); @@ -187,7 +187,7 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsInsideRegion) { TEST_F(VPVerifierTest, BlockOutsideRegionWithParent) { VPlan &Plan = getPlan(); VPBasicBlock *VPBB1 = Plan.getEntry(); - VPBasicBlock *VPBB2 = new VPBasicBlock(); + VPBasicBlock *VPBB2 = Plan.createVPBasicBlock(""); VPInstruction *DefI = new VPInstruction(Instruction::Add, {}); VPInstruction *BranchOnCond = From 56a41810be9d062ee2fb204952431c88fa9422dc Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 31 Dec 2024 13:18:44 +0000 Subject: [PATCH 15/25] !fixup update after merge --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 7 ---- .../Transforms/Vectorize/VPlanTransforms.cpp | 32 +++++++++---------- .../version-stride-with-integer-casts.ll | 3 +- 
.../vplan-printing-before-execute.ll | 2 +- 4 files changed, 17 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index c2dcb77ce81a9..96c0da20ee3b8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -521,13 +521,6 @@ VPBasicBlock *VPBasicBlock::clone() { return NewBlock; } -VPBasicBlock *VPBasicBlock::clone() { - auto *NewBlock = getPlan()->createVPBasicBlock(getName()); - for (VPRecipeBase &R : *this) - NewBlock->appendRecipe(R.clone()); - return NewBlock; -} - void VPBasicBlock::executeRecipes(VPTransformState *State, BasicBlock *BB) { LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName() << " in BB:" << BB->getName() << '\n'); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 8c481774987e6..06edb4a5f8e98 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -667,8 +667,8 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, PredicatedScalarEvolution &PSE) { assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan"); assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan"); - VPBasicBlock *ExitingVPBB = - Plan.getVectorLoopRegion()->getExitingBasicBlock(); + VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock(); auto *Term = &ExitingVPBB->back(); // Try to simplify the branch condition if TC <= VF * UF when preparing to // execute the plan for the main vector loop. 
We only do this if the @@ -692,9 +692,9 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C)) return; - SmallVector PossiblyDead(Term->operands()); Term->eraseFromParent(); - auto *Header = cast(Plan.getVectorLoopRegion()->getEntry()); + auto *Header = cast(VectorRegion->getEntry()); + auto *CanIVTy = Plan.getCanonicalIV()->getScalarType(); if (any_of(Header->phis(), IsaPred)) { LLVMContext &Ctx = SE.getContext(); @@ -709,27 +709,25 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, P->eraseFromParent(); } - VPBlockBase *Preheader = Plan.getVectorLoopRegion()->getSinglePredecessor(); - auto *Exiting = - cast(Plan.getVectorLoopRegion()->getExiting()); - - auto *LoopRegion = Plan.getVectorLoopRegion(); - VPBlockBase *Middle = LoopRegion->getSingleSuccessor(); - VPBlockUtils::disconnectBlocks(Preheader, LoopRegion); - VPBlockUtils::disconnectBlocks(LoopRegion, Middle); + VPBlockBase *Preheader = Plan.getVectorPreheader(); + VPBlockBase *Middle = Plan.getMiddleBlock(); + VPBlockUtils::disconnectBlocks(Preheader, VectorRegion); + VPBlockUtils::disconnectBlocks(VectorRegion, Middle); Header->setParent(nullptr); - Exiting->setParent(nullptr); + ExitingVPBB->setParent(nullptr); - for (VPBlockBase *B : vp_depth_first_shallow(LoopRegion->getEntry())) { + for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry())) { if (isa(B)) B->setParent(nullptr); } VPBlockUtils::connectBlocks(Preheader, Header); - VPBlockUtils::connectBlocks(Exiting, Middle); + VPBlockUtils::connectBlocks(ExitingVPBB, Middle); + simplifyRecipes(Plan, CanIVTy); } - for (VPValue *Op : PossiblyDead) - recursivelyDeleteDeadRecipes(Op); + + VPlanTransforms::removeDeadRecipes(Plan); + Plan.setVF(BestVF); Plan.setUF(BestUF); // TODO: Further simplifications are possible diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll 
b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll index 55eab88950504..791c995d88c14 100644 --- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll +++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll @@ -499,9 +499,8 @@ define void @sext_of_i1_stride(i1 %g, ptr %dst) mustprogress { ; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], [[G_64]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 0, [[G_64]] ; CHECK-NEXT: [[TMP8:%.*]] = mul i64 0, [[G_64]] -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], [[TMP8]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 0, [[TMP8]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 -3 diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll index d047d1da5ef73..beb305f23884e 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll @@ -74,7 +74,7 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; CHECK-NEXT: IR %n.vec = sub i64 %and, %n.mod.vf ; CHECK-NEXT: vp<[[END1:%.+]]> = DERIVED-IV ir<%and> + ir<[[VTC]]> * ir<-1> ; CHECK-NEXT: vp<[[END2:%.+]]> = DERIVED-IV ir<%A> + ir<[[VTC]]> * ir<1> -; CHECK-NEXT: Successor(s): vector loop +; CHECK-NEXT: Successor(s): vector.body ; CHECK-EMPTY: ; CHECK-NEXT: vector.body: ; CHECK-NEXT: vp<[[STEPS1:%.+]]> = SCALAR-STEPS ir<0>, ir<1> From 303ce930d626804869027a48369346712a1f7bb9 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 31 Dec 2024 17:42:42 +0000 Subject: [PATCH 16/25] !fixup adjust assert --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 96c0da20ee3b8..34ca03aa5cc75 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -555,7 +555,7 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) { template static T *getEnclosingLoopRegionForRegion(T *P) { if (P && P->isReplicator()) { P = P->getParent(); - assert(!cast(P)->isReplicator() && + assert((!P || !cast(P)->isReplicator()) && "unexpected nested replicate regions"); } return P; From f9db2d0c10fad26f1af3e996e1a06138c65e599b Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 31 Dec 2024 20:14:45 +0000 Subject: [PATCH 17/25] !fixup update extra tests. --- .../vector-loop-backedge-elimination.ll | 608 ++++++++++-------- 1 file changed, 329 insertions(+), 279 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll index 6bf8c4c4ebb8a..3de42080b1842 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll @@ -8,7 +8,131 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" ; Check if the vector loop condition can be simplified to true for a given ; VF/IC combination. 
define void @test_tc_less_than_16(ptr %A, i64 %N) { +; VF8UF1-LABEL: define void @test_tc_less_than_16( +; VF8UF1-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; VF8UF1-NEXT: [[ENTRY:.*]]: +; VF8UF1-NEXT: [[AND:%.*]] = and i64 [[N]], 15 +; VF8UF1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[AND]], 8 +; VF8UF1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF8UF1: [[VECTOR_PH]]: +; VF8UF1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[AND]], 8 +; VF8UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[AND]], [[N_MOD_VF]] +; VF8UF1-NEXT: [[TMP0:%.*]] = sub i64 [[AND]], [[N_VEC]] +; VF8UF1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]] +; VF8UF1-NEXT: br label %[[VECTOR_BODY:.*]] +; VF8UF1: [[VECTOR_BODY]]: +; VF8UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF8UF1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; VF8UF1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; VF8UF1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 +; VF8UF1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1 +; VF8UF1-NEXT: [[TMP4:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10) +; VF8UF1-NEXT: store <8 x i8> [[TMP4]], ptr [[TMP3]], align 1 +; VF8UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; VF8UF1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF8UF1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF8UF1: [[MIDDLE_BLOCK]]: +; VF8UF1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]] +; VF8UF1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF8UF1: [[SCALAR_PH]]: +; VF8UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[AND]], %[[ENTRY]] ] +; VF8UF1-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ] +; VF8UF1-NEXT: br label %[[LOOP:.*]] +; VF8UF1: [[LOOP]]: +; VF8UF1-NEXT: [[IV:%.*]] = phi i64 [ 
[[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF8UF1-NEXT: [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ] +; VF8UF1-NEXT: [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1 +; VF8UF1-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1 +; VF8UF1-NEXT: [[ADD:%.*]] = add nsw i8 [[L]], 10 +; VF8UF1-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1 +; VF8UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; VF8UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; VF8UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; VF8UF1: [[EXIT]]: +; VF8UF1-NEXT: ret void +; +; VF8UF2-LABEL: define void @test_tc_less_than_16( +; VF8UF2-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; VF8UF2-NEXT: [[ENTRY:.*]]: +; VF8UF2-NEXT: [[AND:%.*]] = and i64 [[N]], 15 +; VF8UF2-NEXT: br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF8UF2: [[VECTOR_PH]]: +; VF8UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[AND]], 16 +; VF8UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[AND]], [[N_MOD_VF]] +; VF8UF2-NEXT: [[TMP0:%.*]] = sub i64 [[AND]], [[N_VEC]] +; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]] +; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF8UF2: [[VECTOR_BODY]]: +; VF8UF2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 0 +; VF8UF2-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 +; VF8UF2-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8 +; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 +; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1 +; VF8UF2-NEXT: [[TMP4:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10) +; VF8UF2-NEXT: [[TMP5:%.*]] = add nsw <8 x i8> [[WIDE_LOAD1]], splat (i8 10) +; VF8UF2-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 +; VF8UF2-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8 +; VF8UF2-NEXT: store <8 x i8> [[TMP4]], ptr 
[[TMP6]], align 1 +; VF8UF2-NEXT: store <8 x i8> [[TMP5]], ptr [[TMP7]], align 1 +; VF8UF2-NEXT: br label %[[MIDDLE_BLOCK:.*]] +; VF8UF2: [[MIDDLE_BLOCK]]: +; VF8UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]] +; VF8UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF8UF2: [[SCALAR_PH]]: +; VF8UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[AND]], %[[ENTRY]] ] +; VF8UF2-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ] +; VF8UF2-NEXT: br label %[[LOOP:.*]] +; VF8UF2: [[LOOP]]: +; VF8UF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF8UF2-NEXT: [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ] +; VF8UF2-NEXT: [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1 +; VF8UF2-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1 +; VF8UF2-NEXT: [[ADD:%.*]] = add nsw i8 [[L]], 10 +; VF8UF2-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1 +; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] +; VF8UF2: [[EXIT]]: +; VF8UF2-NEXT: ret void ; +; VF16UF1-LABEL: define void @test_tc_less_than_16( +; VF16UF1-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; VF16UF1-NEXT: [[ENTRY:.*]]: +; VF16UF1-NEXT: [[AND:%.*]] = and i64 [[N]], 15 +; VF16UF1-NEXT: br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF16UF1: [[VECTOR_PH]]: +; VF16UF1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[AND]], 16 +; VF16UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[AND]], [[N_MOD_VF]] +; VF16UF1-NEXT: [[TMP0:%.*]] = sub i64 [[AND]], [[N_VEC]] +; VF16UF1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]] +; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]] +; VF16UF1: [[VECTOR_BODY]]: +; VF16UF1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 0 +; 
VF16UF1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 +; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; VF16UF1-NEXT: [[TMP3:%.*]] = add nsw <16 x i8> [[WIDE_LOAD]], splat (i8 10) +; VF16UF1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 +; VF16UF1-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP4]], align 1 +; VF16UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]] +; VF16UF1: [[MIDDLE_BLOCK]]: +; VF16UF1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]] +; VF16UF1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF16UF1: [[SCALAR_PH]]: +; VF16UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[AND]], %[[ENTRY]] ] +; VF16UF1-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ] +; VF16UF1-NEXT: br label %[[LOOP:.*]] +; VF16UF1: [[LOOP]]: +; VF16UF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF16UF1-NEXT: [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ] +; VF16UF1-NEXT: [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1 +; VF16UF1-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1 +; VF16UF1-NEXT: [[ADD:%.*]] = add nsw i8 [[L]], 10 +; VF16UF1-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1 +; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] +; VF16UF1: [[EXIT]]: +; VF16UF1-NEXT: ret void ; entry: %and = and i64 %N, 15 @@ -41,82 +165,68 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF8UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; VF8UF1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1 ; VF8UF1-NEXT: [[TMP1:%.*]] = add i64 2, [[N_VEC]] -; VF8UF1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> 
poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; VF8UF1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer +; VF8UF1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF8UF1-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer ; VF8UF1-NEXT: br label %[[VECTOR_BODY:.*]] ; VF8UF1: [[VECTOR_BODY]]: -; VF8UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE16:.*]] ] -; VF8UF1-NEXT: [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]] -; VF8UF1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i64 0 -; VF8UF1-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer -; VF8UF1-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT1]], -; VF8UF1-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; VF8UF1-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> , [[BROADCAST_SPLAT1]] ; VF8UF1-NEXT: [[TMP3:%.*]] = extractelement <8 x i1> [[TMP2]], i32 0 ; VF8UF1-NEXT: br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; VF8UF1: [[PRED_STORE_IF]]: -; VF8UF1-NEXT: [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 0 -; VF8UF1-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP20]] +; VF8UF1-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 2 ; VF8UF1-NEXT: store i16 0, ptr [[TMP4]], align 2 ; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE]] ; VF8UF1: [[PRED_STORE_CONTINUE]]: ; VF8UF1-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP2]], i32 1 -; VF8UF1-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] -; VF8UF1: [[PRED_STORE_IF3]]: -; VF8UF1-NEXT: [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 1 -; VF8UF1-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 
[[TMP21]] +; VF8UF1-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] +; VF8UF1: [[PRED_STORE_IF1]]: +; VF8UF1-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 3 ; VF8UF1-NEXT: store i16 0, ptr [[TMP6]], align 2 +; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE2]] +; VF8UF1: [[PRED_STORE_CONTINUE2]]: +; VF8UF1-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP2]], i32 2 +; VF8UF1-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] +; VF8UF1: [[PRED_STORE_IF3]]: +; VF8UF1-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 4 +; VF8UF1-NEXT: store i16 0, ptr [[TMP8]], align 2 ; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE4]] ; VF8UF1: [[PRED_STORE_CONTINUE4]]: -; VF8UF1-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP2]], i32 2 -; VF8UF1-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] +; VF8UF1-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP2]], i32 3 +; VF8UF1-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] ; VF8UF1: [[PRED_STORE_IF5]]: -; VF8UF1-NEXT: [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 2 -; VF8UF1-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP23]] -; VF8UF1-NEXT: store i16 0, ptr [[TMP8]], align 2 +; VF8UF1-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 5 +; VF8UF1-NEXT: store i16 0, ptr [[TMP10]], align 2 ; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE6]] ; VF8UF1: [[PRED_STORE_CONTINUE6]]: -; VF8UF1-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP2]], i32 3 -; VF8UF1-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; VF8UF1-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP2]], i32 4 +; VF8UF1-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] ; VF8UF1: [[PRED_STORE_IF7]]: -; VF8UF1-NEXT: [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 3 -; VF8UF1-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], 
i64 [[TMP24]] -; VF8UF1-NEXT: store i16 0, ptr [[TMP10]], align 2 +; VF8UF1-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 6 +; VF8UF1-NEXT: store i16 0, ptr [[TMP12]], align 2 ; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE8]] ; VF8UF1: [[PRED_STORE_CONTINUE8]]: -; VF8UF1-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP2]], i32 4 -; VF8UF1-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; VF8UF1-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP2]], i32 5 +; VF8UF1-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] ; VF8UF1: [[PRED_STORE_IF9]]: -; VF8UF1-NEXT: [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 4 -; VF8UF1-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP26]] -; VF8UF1-NEXT: store i16 0, ptr [[TMP12]], align 2 +; VF8UF1-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 7 +; VF8UF1-NEXT: store i16 0, ptr [[TMP14]], align 2 ; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE10]] ; VF8UF1: [[PRED_STORE_CONTINUE10]]: -; VF8UF1-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP2]], i32 5 -; VF8UF1-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] +; VF8UF1-NEXT: [[TMP15:%.*]] = extractelement <8 x i1> [[TMP2]], i32 6 +; VF8UF1-NEXT: br i1 [[TMP15]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] ; VF8UF1: [[PRED_STORE_IF11]]: -; VF8UF1-NEXT: [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 5 -; VF8UF1-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP19]] -; VF8UF1-NEXT: store i16 0, ptr [[TMP14]], align 2 +; VF8UF1-NEXT: [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 8 +; VF8UF1-NEXT: store i16 0, ptr [[TMP16]], align 2 ; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE12]] ; VF8UF1: [[PRED_STORE_CONTINUE12]]: -; VF8UF1-NEXT: [[TMP15:%.*]] = extractelement <8 x i1> [[TMP2]], i32 6 -; VF8UF1-NEXT: br i1 [[TMP15]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]] +; 
VF8UF1-NEXT: [[TMP17:%.*]] = extractelement <8 x i1> [[TMP2]], i32 7 +; VF8UF1-NEXT: br i1 [[TMP17]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]] ; VF8UF1: [[PRED_STORE_IF13]]: -; VF8UF1-NEXT: [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 6 -; VF8UF1-NEXT: [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP22]] -; VF8UF1-NEXT: store i16 0, ptr [[TMP16]], align 2 +; VF8UF1-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 9 +; VF8UF1-NEXT: store i16 0, ptr [[TMP18]], align 2 ; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE14]] ; VF8UF1: [[PRED_STORE_CONTINUE14]]: -; VF8UF1-NEXT: [[TMP17:%.*]] = extractelement <8 x i1> [[TMP2]], i32 7 -; VF8UF1-NEXT: br i1 [[TMP17]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16]] -; VF8UF1: [[PRED_STORE_IF15]]: -; VF8UF1-NEXT: [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 7 -; VF8UF1-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP25]] -; VF8UF1-NEXT: store i16 0, ptr [[TMP18]], align 2 -; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE16]] -; VF8UF1: [[PRED_STORE_CONTINUE16]]: -; VF8UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; VF8UF1-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF8UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF8UF1: [[MIDDLE_BLOCK]]: ; VF8UF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; VF8UF1: [[SCALAR_PH]]: @@ -128,7 +238,7 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF8UF1-NEXT: store i16 0, ptr [[GEP_DST]], align 2 ; VF8UF1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; VF8UF1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF8UF1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; VF8UF1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; VF8UF1: [[EXIT]]: ; VF8UF1-NEXT: ret void ; @@ -143,148 +253,125 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 
range(i64 5, ; VF8UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; VF8UF2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1 ; VF8UF2-NEXT: [[TMP1:%.*]] = add i64 2, [[N_VEC]] -; VF8UF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; VF8UF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer +; VF8UF2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF8UF2-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer ; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]] ; VF8UF2: [[VECTOR_BODY]]: -; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE35:.*]] ] -; VF8UF2-NEXT: [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]] -; VF8UF2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i64 0 -; VF8UF2-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer -; VF8UF2-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT1]], -; VF8UF2-NEXT: [[VEC_IV3:%.*]] = add <8 x i64> [[BROADCAST_SPLAT1]], -; VF8UF2-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] -; VF8UF2-NEXT: [[TMP3:%.*]] = icmp ule <8 x i64> [[VEC_IV3]], [[BROADCAST_SPLAT]] +; VF8UF2-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> , [[BROADCAST_SPLAT1]] +; VF8UF2-NEXT: [[TMP3:%.*]] = icmp ule <8 x i64> , [[BROADCAST_SPLAT1]] ; VF8UF2-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP2]], i32 0 ; VF8UF2-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; VF8UF2: [[PRED_STORE_IF]]: -; VF8UF2-NEXT: [[TMP36:%.*]] = add i64 [[OFFSET_IDX]], 0 -; VF8UF2-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP36]] +; VF8UF2-NEXT: [[TMP5:%.*]] = 
getelementptr i16, ptr [[DST]], i64 2 ; VF8UF2-NEXT: store i16 0, ptr [[TMP5]], align 2 ; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE]] ; VF8UF2: [[PRED_STORE_CONTINUE]]: ; VF8UF2-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP2]], i32 1 -; VF8UF2-NEXT: br i1 [[TMP6]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]] -; VF8UF2: [[PRED_STORE_IF6]]: -; VF8UF2-NEXT: [[TMP37:%.*]] = add i64 [[OFFSET_IDX]], 1 -; VF8UF2-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP37]] +; VF8UF2-NEXT: br i1 [[TMP6]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] +; VF8UF2: [[PRED_STORE_IF1]]: +; VF8UF2-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[DST]], i64 3 ; VF8UF2-NEXT: store i16 0, ptr [[TMP7]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE7]] -; VF8UF2: [[PRED_STORE_CONTINUE7]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE2]] +; VF8UF2: [[PRED_STORE_CONTINUE2]]: ; VF8UF2-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP2]], i32 2 -; VF8UF2-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]] -; VF8UF2: [[PRED_STORE_IF8]]: -; VF8UF2-NEXT: [[TMP39:%.*]] = add i64 [[OFFSET_IDX]], 2 -; VF8UF2-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP39]] +; VF8UF2-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] +; VF8UF2: [[PRED_STORE_IF3]]: +; VF8UF2-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[DST]], i64 4 ; VF8UF2-NEXT: store i16 0, ptr [[TMP9]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE9]] -; VF8UF2: [[PRED_STORE_CONTINUE9]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE4]] +; VF8UF2: [[PRED_STORE_CONTINUE4]]: ; VF8UF2-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP2]], i32 3 -; VF8UF2-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11:.*]] -; VF8UF2: [[PRED_STORE_IF10]]: -; VF8UF2-NEXT: [[TMP40:%.*]] = add i64 [[OFFSET_IDX]], 3 -; VF8UF2-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr 
[[DST]], i64 [[TMP40]] +; VF8UF2-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] +; VF8UF2: [[PRED_STORE_IF5]]: +; VF8UF2-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 5 ; VF8UF2-NEXT: store i16 0, ptr [[TMP11]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE11]] -; VF8UF2: [[PRED_STORE_CONTINUE11]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE6]] +; VF8UF2: [[PRED_STORE_CONTINUE6]]: ; VF8UF2-NEXT: [[TMP12:%.*]] = extractelement <8 x i1> [[TMP2]], i32 4 -; VF8UF2-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]] -; VF8UF2: [[PRED_STORE_IF12]]: -; VF8UF2-NEXT: [[TMP42:%.*]] = add i64 [[OFFSET_IDX]], 4 -; VF8UF2-NEXT: [[TMP13:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP42]] +; VF8UF2-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; VF8UF2: [[PRED_STORE_IF7]]: +; VF8UF2-NEXT: [[TMP13:%.*]] = getelementptr i16, ptr [[DST]], i64 6 ; VF8UF2-NEXT: store i16 0, ptr [[TMP13]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE13]] -; VF8UF2: [[PRED_STORE_CONTINUE13]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; VF8UF2: [[PRED_STORE_CONTINUE8]]: ; VF8UF2-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP2]], i32 5 -; VF8UF2-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]] -; VF8UF2: [[PRED_STORE_IF14]]: -; VF8UF2-NEXT: [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], 5 -; VF8UF2-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP43]] +; VF8UF2-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; VF8UF2: [[PRED_STORE_IF9]]: +; VF8UF2-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr [[DST]], i64 7 ; VF8UF2-NEXT: store i16 0, ptr [[TMP15]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE15]] -; VF8UF2: [[PRED_STORE_CONTINUE15]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; VF8UF2: [[PRED_STORE_CONTINUE10]]: ; 
VF8UF2-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP2]], i32 6 -; VF8UF2-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17:.*]] -; VF8UF2: [[PRED_STORE_IF16]]: -; VF8UF2-NEXT: [[TMP45:%.*]] = add i64 [[OFFSET_IDX]], 6 -; VF8UF2-NEXT: [[TMP17:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP45]] +; VF8UF2-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] +; VF8UF2: [[PRED_STORE_IF11]]: +; VF8UF2-NEXT: [[TMP17:%.*]] = getelementptr i16, ptr [[DST]], i64 8 ; VF8UF2-NEXT: store i16 0, ptr [[TMP17]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE17]] -; VF8UF2: [[PRED_STORE_CONTINUE17]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; VF8UF2: [[PRED_STORE_CONTINUE12]]: ; VF8UF2-NEXT: [[TMP18:%.*]] = extractelement <8 x i1> [[TMP2]], i32 7 -; VF8UF2-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF18:.*]], label %[[PRED_STORE_CONTINUE19:.*]] -; VF8UF2: [[PRED_STORE_IF18]]: -; VF8UF2-NEXT: [[TMP46:%.*]] = add i64 [[OFFSET_IDX]], 7 -; VF8UF2-NEXT: [[TMP19:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP46]] +; VF8UF2-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]] +; VF8UF2: [[PRED_STORE_IF13]]: +; VF8UF2-NEXT: [[TMP19:%.*]] = getelementptr i16, ptr [[DST]], i64 9 ; VF8UF2-NEXT: store i16 0, ptr [[TMP19]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE19]] -; VF8UF2: [[PRED_STORE_CONTINUE19]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE14]] +; VF8UF2: [[PRED_STORE_CONTINUE14]]: ; VF8UF2-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0 -; VF8UF2-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF20:.*]], label %[[PRED_STORE_CONTINUE21:.*]] -; VF8UF2: [[PRED_STORE_IF20]]: -; VF8UF2-NEXT: [[TMP48:%.*]] = add i64 [[OFFSET_IDX]], 8 -; VF8UF2-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP48]] +; VF8UF2-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]] +; VF8UF2: 
[[PRED_STORE_IF15]]: +; VF8UF2-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[DST]], i64 10 ; VF8UF2-NEXT: store i16 0, ptr [[TMP21]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE21]] -; VF8UF2: [[PRED_STORE_CONTINUE21]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE16]] +; VF8UF2: [[PRED_STORE_CONTINUE16]]: ; VF8UF2-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1 -; VF8UF2-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF22:.*]], label %[[PRED_STORE_CONTINUE23:.*]] -; VF8UF2: [[PRED_STORE_IF22]]: -; VF8UF2-NEXT: [[TMP49:%.*]] = add i64 [[OFFSET_IDX]], 9 -; VF8UF2-NEXT: [[TMP23:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP49]] +; VF8UF2-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]] +; VF8UF2: [[PRED_STORE_IF17]]: +; VF8UF2-NEXT: [[TMP23:%.*]] = getelementptr i16, ptr [[DST]], i64 11 ; VF8UF2-NEXT: store i16 0, ptr [[TMP23]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE23]] -; VF8UF2: [[PRED_STORE_CONTINUE23]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE18]] +; VF8UF2: [[PRED_STORE_CONTINUE18]]: ; VF8UF2-NEXT: [[TMP24:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2 -; VF8UF2-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF24:.*]], label %[[PRED_STORE_CONTINUE25:.*]] -; VF8UF2: [[PRED_STORE_IF24]]: -; VF8UF2-NEXT: [[TMP51:%.*]] = add i64 [[OFFSET_IDX]], 10 -; VF8UF2-NEXT: [[TMP25:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP51]] +; VF8UF2-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]] +; VF8UF2: [[PRED_STORE_IF19]]: +; VF8UF2-NEXT: [[TMP25:%.*]] = getelementptr i16, ptr [[DST]], i64 12 ; VF8UF2-NEXT: store i16 0, ptr [[TMP25]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE25]] -; VF8UF2: [[PRED_STORE_CONTINUE25]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE20]] +; VF8UF2: [[PRED_STORE_CONTINUE20]]: ; VF8UF2-NEXT: [[TMP26:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3 -; VF8UF2-NEXT: br i1 [[TMP26]], label 
%[[PRED_STORE_IF26:.*]], label %[[PRED_STORE_CONTINUE27:.*]] -; VF8UF2: [[PRED_STORE_IF26]]: -; VF8UF2-NEXT: [[TMP38:%.*]] = add i64 [[OFFSET_IDX]], 11 -; VF8UF2-NEXT: [[TMP27:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP38]] +; VF8UF2-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] +; VF8UF2: [[PRED_STORE_IF21]]: +; VF8UF2-NEXT: [[TMP27:%.*]] = getelementptr i16, ptr [[DST]], i64 13 ; VF8UF2-NEXT: store i16 0, ptr [[TMP27]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE27]] -; VF8UF2: [[PRED_STORE_CONTINUE27]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE22]] +; VF8UF2: [[PRED_STORE_CONTINUE22]]: ; VF8UF2-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4 -; VF8UF2-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF28:.*]], label %[[PRED_STORE_CONTINUE29:.*]] -; VF8UF2: [[PRED_STORE_IF28]]: -; VF8UF2-NEXT: [[TMP41:%.*]] = add i64 [[OFFSET_IDX]], 12 -; VF8UF2-NEXT: [[TMP29:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP41]] +; VF8UF2-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] +; VF8UF2: [[PRED_STORE_IF23]]: +; VF8UF2-NEXT: [[TMP29:%.*]] = getelementptr i16, ptr [[DST]], i64 14 ; VF8UF2-NEXT: store i16 0, ptr [[TMP29]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE29]] -; VF8UF2: [[PRED_STORE_CONTINUE29]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE24]] +; VF8UF2: [[PRED_STORE_CONTINUE24]]: ; VF8UF2-NEXT: [[TMP30:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5 -; VF8UF2-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF30:.*]], label %[[PRED_STORE_CONTINUE31:.*]] -; VF8UF2: [[PRED_STORE_IF30]]: -; VF8UF2-NEXT: [[TMP44:%.*]] = add i64 [[OFFSET_IDX]], 13 -; VF8UF2-NEXT: [[TMP31:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP44]] +; VF8UF2-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]] +; VF8UF2: [[PRED_STORE_IF25]]: +; VF8UF2-NEXT: [[TMP31:%.*]] = getelementptr i16, ptr [[DST]], i64 15 ; VF8UF2-NEXT: 
store i16 0, ptr [[TMP31]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE31]] -; VF8UF2: [[PRED_STORE_CONTINUE31]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE26]] +; VF8UF2: [[PRED_STORE_CONTINUE26]]: ; VF8UF2-NEXT: [[TMP32:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6 -; VF8UF2-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33:.*]] -; VF8UF2: [[PRED_STORE_IF32]]: -; VF8UF2-NEXT: [[TMP47:%.*]] = add i64 [[OFFSET_IDX]], 14 -; VF8UF2-NEXT: [[TMP33:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP47]] +; VF8UF2-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]] +; VF8UF2: [[PRED_STORE_IF27]]: +; VF8UF2-NEXT: [[TMP33:%.*]] = getelementptr i16, ptr [[DST]], i64 16 ; VF8UF2-NEXT: store i16 0, ptr [[TMP33]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE33]] -; VF8UF2: [[PRED_STORE_CONTINUE33]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE28]] +; VF8UF2: [[PRED_STORE_CONTINUE28]]: ; VF8UF2-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7 -; VF8UF2-NEXT: br i1 [[TMP34]], label %[[PRED_STORE_IF34:.*]], label %[[PRED_STORE_CONTINUE35]] -; VF8UF2: [[PRED_STORE_IF34]]: -; VF8UF2-NEXT: [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], 15 -; VF8UF2-NEXT: [[TMP35:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP50]] +; VF8UF2-NEXT: br i1 [[TMP34]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]] +; VF8UF2: [[PRED_STORE_IF29]]: +; VF8UF2-NEXT: [[TMP35:%.*]] = getelementptr i16, ptr [[DST]], i64 17 ; VF8UF2-NEXT: store i16 0, ptr [[TMP35]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE35]] -; VF8UF2: [[PRED_STORE_CONTINUE35]]: -; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; VF8UF2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE30]] +; VF8UF2: [[PRED_STORE_CONTINUE30]]: +; VF8UF2-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF8UF2: 
[[MIDDLE_BLOCK]]: ; VF8UF2-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; VF8UF2: [[SCALAR_PH]]: @@ -296,7 +383,7 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF8UF2-NEXT: store i16 0, ptr [[GEP_DST]], align 2 ; VF8UF2-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; VF8UF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF8UF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; VF8UF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; VF8UF2: [[EXIT]]: ; VF8UF2-NEXT: ret void ; @@ -311,146 +398,124 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF16UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; VF16UF1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1 ; VF16UF1-NEXT: [[TMP1:%.*]] = add i64 2, [[N_VEC]] -; VF16UF1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; VF16UF1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer +; VF16UF1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF16UF1-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer ; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]] ; VF16UF1: [[VECTOR_BODY]]: -; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE32:.*]] ] -; VF16UF1-NEXT: [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]] -; VF16UF1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0 -; VF16UF1-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer -; VF16UF1-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT1]], -; VF16UF1-NEXT: 
[[TMP2:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; VF16UF1-NEXT: [[TMP2:%.*]] = icmp ule <16 x i64> , [[BROADCAST_SPLAT1]] ; VF16UF1-NEXT: [[TMP3:%.*]] = extractelement <16 x i1> [[TMP2]], i32 0 ; VF16UF1-NEXT: br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; VF16UF1: [[PRED_STORE_IF]]: -; VF16UF1-NEXT: [[TMP35:%.*]] = add i64 [[OFFSET_IDX]], 0 -; VF16UF1-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP35]] +; VF16UF1-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 2 ; VF16UF1-NEXT: store i16 0, ptr [[TMP4]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE]] ; VF16UF1: [[PRED_STORE_CONTINUE]]: ; VF16UF1-NEXT: [[TMP5:%.*]] = extractelement <16 x i1> [[TMP2]], i32 1 -; VF16UF1-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] -; VF16UF1: [[PRED_STORE_IF3]]: -; VF16UF1-NEXT: [[TMP36:%.*]] = add i64 [[OFFSET_IDX]], 1 -; VF16UF1-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP36]] +; VF16UF1-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] +; VF16UF1: [[PRED_STORE_IF1]]: +; VF16UF1-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 3 ; VF16UF1-NEXT: store i16 0, ptr [[TMP6]], align 2 +; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE2]] +; VF16UF1: [[PRED_STORE_CONTINUE2]]: +; VF16UF1-NEXT: [[TMP7:%.*]] = extractelement <16 x i1> [[TMP2]], i32 2 +; VF16UF1-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] +; VF16UF1: [[PRED_STORE_IF3]]: +; VF16UF1-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 4 +; VF16UF1-NEXT: store i16 0, ptr [[TMP8]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE4]] ; VF16UF1: [[PRED_STORE_CONTINUE4]]: -; VF16UF1-NEXT: [[TMP7:%.*]] = extractelement <16 x i1> [[TMP2]], i32 2 -; VF16UF1-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] +; VF16UF1-NEXT: [[TMP9:%.*]] = extractelement <16 x 
i1> [[TMP2]], i32 3 +; VF16UF1-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] ; VF16UF1: [[PRED_STORE_IF5]]: -; VF16UF1-NEXT: [[TMP38:%.*]] = add i64 [[OFFSET_IDX]], 2 -; VF16UF1-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP38]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP8]], align 2 +; VF16UF1-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 5 +; VF16UF1-NEXT: store i16 0, ptr [[TMP10]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE6]] ; VF16UF1: [[PRED_STORE_CONTINUE6]]: -; VF16UF1-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[TMP2]], i32 3 -; VF16UF1-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; VF16UF1-NEXT: [[TMP11:%.*]] = extractelement <16 x i1> [[TMP2]], i32 4 +; VF16UF1-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] ; VF16UF1: [[PRED_STORE_IF7]]: -; VF16UF1-NEXT: [[TMP39:%.*]] = add i64 [[OFFSET_IDX]], 3 -; VF16UF1-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP39]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP10]], align 2 +; VF16UF1-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 6 +; VF16UF1-NEXT: store i16 0, ptr [[TMP12]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE8]] ; VF16UF1: [[PRED_STORE_CONTINUE8]]: -; VF16UF1-NEXT: [[TMP11:%.*]] = extractelement <16 x i1> [[TMP2]], i32 4 -; VF16UF1-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; VF16UF1-NEXT: [[TMP13:%.*]] = extractelement <16 x i1> [[TMP2]], i32 5 +; VF16UF1-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] ; VF16UF1: [[PRED_STORE_IF9]]: -; VF16UF1-NEXT: [[TMP41:%.*]] = add i64 [[OFFSET_IDX]], 4 -; VF16UF1-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP41]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP12]], align 2 +; VF16UF1-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 7 +; VF16UF1-NEXT: store i16 0, 
ptr [[TMP14]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE10]] ; VF16UF1: [[PRED_STORE_CONTINUE10]]: -; VF16UF1-NEXT: [[TMP13:%.*]] = extractelement <16 x i1> [[TMP2]], i32 5 -; VF16UF1-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] +; VF16UF1-NEXT: [[TMP15:%.*]] = extractelement <16 x i1> [[TMP2]], i32 6 +; VF16UF1-NEXT: br i1 [[TMP15]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] ; VF16UF1: [[PRED_STORE_IF11]]: -; VF16UF1-NEXT: [[TMP42:%.*]] = add i64 [[OFFSET_IDX]], 5 -; VF16UF1-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP42]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP14]], align 2 +; VF16UF1-NEXT: [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 8 +; VF16UF1-NEXT: store i16 0, ptr [[TMP16]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE12]] ; VF16UF1: [[PRED_STORE_CONTINUE12]]: -; VF16UF1-NEXT: [[TMP15:%.*]] = extractelement <16 x i1> [[TMP2]], i32 6 -; VF16UF1-NEXT: br i1 [[TMP15]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]] +; VF16UF1-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP2]], i32 7 +; VF16UF1-NEXT: br i1 [[TMP17]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]] ; VF16UF1: [[PRED_STORE_IF13]]: -; VF16UF1-NEXT: [[TMP44:%.*]] = add i64 [[OFFSET_IDX]], 6 -; VF16UF1-NEXT: [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP44]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP16]], align 2 +; VF16UF1-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 9 +; VF16UF1-NEXT: store i16 0, ptr [[TMP18]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE14]] ; VF16UF1: [[PRED_STORE_CONTINUE14]]: -; VF16UF1-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP2]], i32 7 -; VF16UF1-NEXT: br i1 [[TMP17]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]] +; VF16UF1-NEXT: [[TMP19:%.*]] = extractelement <16 x i1> [[TMP2]], i32 8 +; VF16UF1-NEXT: br i1 [[TMP19]], label 
%[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]] ; VF16UF1: [[PRED_STORE_IF15]]: -; VF16UF1-NEXT: [[TMP45:%.*]] = add i64 [[OFFSET_IDX]], 7 -; VF16UF1-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP45]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP18]], align 2 +; VF16UF1-NEXT: [[TMP20:%.*]] = getelementptr i16, ptr [[DST]], i64 10 +; VF16UF1-NEXT: store i16 0, ptr [[TMP20]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE16]] ; VF16UF1: [[PRED_STORE_CONTINUE16]]: -; VF16UF1-NEXT: [[TMP19:%.*]] = extractelement <16 x i1> [[TMP2]], i32 8 -; VF16UF1-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]] +; VF16UF1-NEXT: [[TMP21:%.*]] = extractelement <16 x i1> [[TMP2]], i32 9 +; VF16UF1-NEXT: br i1 [[TMP21]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]] ; VF16UF1: [[PRED_STORE_IF17]]: -; VF16UF1-NEXT: [[TMP47:%.*]] = add i64 [[OFFSET_IDX]], 8 -; VF16UF1-NEXT: [[TMP20:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP47]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP20]], align 2 +; VF16UF1-NEXT: [[TMP22:%.*]] = getelementptr i16, ptr [[DST]], i64 11 +; VF16UF1-NEXT: store i16 0, ptr [[TMP22]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE18]] ; VF16UF1: [[PRED_STORE_CONTINUE18]]: -; VF16UF1-NEXT: [[TMP21:%.*]] = extractelement <16 x i1> [[TMP2]], i32 9 -; VF16UF1-NEXT: br i1 [[TMP21]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]] +; VF16UF1-NEXT: [[TMP23:%.*]] = extractelement <16 x i1> [[TMP2]], i32 10 +; VF16UF1-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]] ; VF16UF1: [[PRED_STORE_IF19]]: -; VF16UF1-NEXT: [[TMP48:%.*]] = add i64 [[OFFSET_IDX]], 9 -; VF16UF1-NEXT: [[TMP22:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP48]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP22]], align 2 +; VF16UF1-NEXT: [[TMP24:%.*]] = getelementptr i16, ptr [[DST]], i64 12 +; VF16UF1-NEXT: store i16 0, ptr [[TMP24]], align 2 ; 
VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE20]] ; VF16UF1: [[PRED_STORE_CONTINUE20]]: -; VF16UF1-NEXT: [[TMP23:%.*]] = extractelement <16 x i1> [[TMP2]], i32 10 -; VF16UF1-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] +; VF16UF1-NEXT: [[TMP25:%.*]] = extractelement <16 x i1> [[TMP2]], i32 11 +; VF16UF1-NEXT: br i1 [[TMP25]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] ; VF16UF1: [[PRED_STORE_IF21]]: -; VF16UF1-NEXT: [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], 10 -; VF16UF1-NEXT: [[TMP24:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP50]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP24]], align 2 +; VF16UF1-NEXT: [[TMP26:%.*]] = getelementptr i16, ptr [[DST]], i64 13 +; VF16UF1-NEXT: store i16 0, ptr [[TMP26]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE22]] ; VF16UF1: [[PRED_STORE_CONTINUE22]]: -; VF16UF1-NEXT: [[TMP25:%.*]] = extractelement <16 x i1> [[TMP2]], i32 11 -; VF16UF1-NEXT: br i1 [[TMP25]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] +; VF16UF1-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP2]], i32 12 +; VF16UF1-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] ; VF16UF1: [[PRED_STORE_IF23]]: -; VF16UF1-NEXT: [[TMP37:%.*]] = add i64 [[OFFSET_IDX]], 11 -; VF16UF1-NEXT: [[TMP26:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP37]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP26]], align 2 +; VF16UF1-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr [[DST]], i64 14 +; VF16UF1-NEXT: store i16 0, ptr [[TMP28]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE24]] ; VF16UF1: [[PRED_STORE_CONTINUE24]]: -; VF16UF1-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP2]], i32 12 -; VF16UF1-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]] +; VF16UF1-NEXT: [[TMP29:%.*]] = extractelement <16 x i1> [[TMP2]], i32 13 +; VF16UF1-NEXT: br i1 [[TMP29]], label %[[PRED_STORE_IF25:.*]], label 
%[[PRED_STORE_CONTINUE26:.*]] ; VF16UF1: [[PRED_STORE_IF25]]: -; VF16UF1-NEXT: [[TMP40:%.*]] = add i64 [[OFFSET_IDX]], 12 -; VF16UF1-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP40]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP28]], align 2 +; VF16UF1-NEXT: [[TMP30:%.*]] = getelementptr i16, ptr [[DST]], i64 15 +; VF16UF1-NEXT: store i16 0, ptr [[TMP30]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE26]] ; VF16UF1: [[PRED_STORE_CONTINUE26]]: -; VF16UF1-NEXT: [[TMP29:%.*]] = extractelement <16 x i1> [[TMP2]], i32 13 -; VF16UF1-NEXT: br i1 [[TMP29]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]] +; VF16UF1-NEXT: [[TMP31:%.*]] = extractelement <16 x i1> [[TMP2]], i32 14 +; VF16UF1-NEXT: br i1 [[TMP31]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]] ; VF16UF1: [[PRED_STORE_IF27]]: -; VF16UF1-NEXT: [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], 13 -; VF16UF1-NEXT: [[TMP30:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP43]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP30]], align 2 +; VF16UF1-NEXT: [[TMP32:%.*]] = getelementptr i16, ptr [[DST]], i64 16 +; VF16UF1-NEXT: store i16 0, ptr [[TMP32]], align 2 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE28]] ; VF16UF1: [[PRED_STORE_CONTINUE28]]: -; VF16UF1-NEXT: [[TMP31:%.*]] = extractelement <16 x i1> [[TMP2]], i32 14 -; VF16UF1-NEXT: br i1 [[TMP31]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]] +; VF16UF1-NEXT: [[TMP33:%.*]] = extractelement <16 x i1> [[TMP2]], i32 15 +; VF16UF1-NEXT: br i1 [[TMP33]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]] ; VF16UF1: [[PRED_STORE_IF29]]: -; VF16UF1-NEXT: [[TMP46:%.*]] = add i64 [[OFFSET_IDX]], 14 -; VF16UF1-NEXT: [[TMP32:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP46]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP32]], align 2 +; VF16UF1-NEXT: [[TMP34:%.*]] = getelementptr i16, ptr [[DST]], i64 17 +; VF16UF1-NEXT: store i16 0, ptr [[TMP34]], align 2 ; VF16UF1-NEXT: br label 
%[[PRED_STORE_CONTINUE30]] ; VF16UF1: [[PRED_STORE_CONTINUE30]]: -; VF16UF1-NEXT: [[TMP33:%.*]] = extractelement <16 x i1> [[TMP2]], i32 15 -; VF16UF1-NEXT: br i1 [[TMP33]], label %[[PRED_STORE_IF31:.*]], label %[[PRED_STORE_CONTINUE32]] -; VF16UF1: [[PRED_STORE_IF31]]: -; VF16UF1-NEXT: [[TMP49:%.*]] = add i64 [[OFFSET_IDX]], 15 -; VF16UF1-NEXT: [[TMP34:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP49]] -; VF16UF1-NEXT: store i16 0, ptr [[TMP34]], align 2 -; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE32]] -; VF16UF1: [[PRED_STORE_CONTINUE32]]: -; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; VF16UF1-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF16UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF16UF1: [[MIDDLE_BLOCK]]: ; VF16UF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; VF16UF1: [[SCALAR_PH]]: @@ -462,7 +527,7 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF16UF1-NEXT: store i16 0, ptr [[GEP_DST]], align 2 ; VF16UF1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; VF16UF1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF16UF1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; VF16UF1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; VF16UF1: [[EXIT]]: ; VF16UF1-NEXT: ret void ; @@ -506,7 +571,7 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias ; VF8UF1-NEXT: store <8 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 1 ; VF8UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; VF8UF1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VF8UF1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF8UF1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; VF8UF1: [[MIDDLE_BLOCK]]: ; VF8UF1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], 
[[N_VEC]] ; VF8UF1-NEXT: br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]] @@ -521,7 +586,7 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias ; VF8UF1-NEXT: store i8 [[L]], ptr [[GEP_DST]], align 1 ; VF8UF1-NEXT: [[IV_NEXT]] = add i64 [[INNER_IV]], 1 ; VF8UF1-NEXT: [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF8UF1-NEXT: br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP7:![0-9]+]] +; VF8UF1-NEXT: br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP6:![0-9]+]] ; VF8UF1: [[OUTER_LATCH]]: ; VF8UF1-NEXT: [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1 ; VF8UF1-NEXT: [[C_2:%.*]] = call i1 @cond() @@ -542,20 +607,17 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias ; VF8UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]] ; VF8UF2: [[VECTOR_BODY]]: -; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; VF8UF2-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[TMP6]] +; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 0 ; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 ; VF8UF2-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 ; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 ; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 -; VF8UF2-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]] +; VF8UF2-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 0 ; VF8UF2-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0 ; VF8UF2-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP3]], i32 8 ; VF8UF2-NEXT: store <8 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 1 ; VF8UF2-NEXT: store <8 x i8> [[WIDE_LOAD1]], ptr [[TMP5]], align 1 -; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], 16 -; VF8UF2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF8UF2-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF8UF2: [[MIDDLE_BLOCK]]: ; VF8UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; VF8UF2-NEXT: br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]] @@ -570,7 +632,7 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias ; VF8UF2-NEXT: store i8 [[L]], ptr [[GEP_DST]], align 1 ; VF8UF2-NEXT: [[IV_NEXT]] = add i64 [[INNER_IV]], 1 ; VF8UF2-NEXT: [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF8UF2-NEXT: br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP7:![0-9]+]] +; VF8UF2-NEXT: br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP4:![0-9]+]] ; VF8UF2: [[OUTER_LATCH]]: ; VF8UF2-NEXT: [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1 ; VF8UF2-NEXT: [[C_2:%.*]] = call i1 @cond() @@ -591,16 +653,13 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias ; VF16UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]] ; VF16UF1: [[VECTOR_BODY]]: -; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; VF16UF1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; VF16UF1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[TMP4]] +; VF16UF1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 0 ; VF16UF1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 ; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 -; VF16UF1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]] +; VF16UF1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 0 ; VF16UF1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP2]], i32 0 ; VF16UF1-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 1 -; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; 
VF16UF1-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF16UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF16UF1: [[MIDDLE_BLOCK]]: ; VF16UF1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; VF16UF1-NEXT: br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]] @@ -615,7 +674,7 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias ; VF16UF1-NEXT: store i8 [[L]], ptr [[GEP_DST]], align 1 ; VF16UF1-NEXT: [[IV_NEXT]] = add i64 [[INNER_IV]], 1 ; VF16UF1-NEXT: [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF16UF1-NEXT: br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP7:![0-9]+]] +; VF16UF1-NEXT: br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP4:![0-9]+]] ; VF16UF1: [[OUTER_LATCH]]: ; VF16UF1-NEXT: [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1 ; VF16UF1-NEXT: [[C_2:%.*]] = call i1 @cond() @@ -653,28 +712,19 @@ exit: ; VF8UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; VF8UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} ; VF8UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} -; VF8UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} -; VF8UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} -; VF8UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; VF8UF1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; VF8UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} +; VF8UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} +; VF8UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]} ;. 
; VF8UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; VF8UF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; VF8UF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; VF8UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; VF8UF2: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"} +; VF8UF2: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} +; VF8UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} ; VF8UF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} -; VF8UF2: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} -; VF8UF2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; VF8UF2: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} ;. ; VF16UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; VF16UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; VF16UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; VF16UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; VF16UF1: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"} +; VF16UF1: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} +; VF16UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} ; VF16UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} -; VF16UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} -; VF16UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; VF16UF1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} ;. -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; CHECK: {{.*}} From cabc59194fcaeefceee8cbd2c87293fdcb7c57ae Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 31 Dec 2024 20:44:53 +0000 Subject: [PATCH 18/25] !fixup fix formatting --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 46 ++++++++++++------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 88282dfbe1918..65a3f04642262 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1025,31 +1025,31 @@ void VPlan::execute(VPTransformState *State) { if (isa(&R)) continue; - if (isa(&R)) { - PHINode *Phi = nullptr; - if (isa(&R)) { - Phi = cast(State->get(R.getVPSingleValue())); - } else { - auto *WidenPhi = cast(&R); - assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) && - "recipe generating only scalars should have been replaced"); - auto *GEP = cast(State->get(WidenPhi)); - Phi = cast(GEP->getPointerOperand()); + if (isa(&R)) { + PHINode *Phi = nullptr; + if (isa(&R)) { + Phi = cast(State->get(R.getVPSingleValue())); + } else { + auto *WidenPhi = cast(&R); + assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) && + "recipe generating only scalars should have been replaced"); + auto *GEP = cast(State->get(WidenPhi)); + Phi = cast(GEP->getPointerOperand()); + } + + Phi->setIncomingBlock(1, VectorLatchBB); + + // Move the last step to the end of the latch block. This ensures + // consistent placement of all induction updates. + Instruction *Inc = cast(Phi->getIncomingValue(1)); + Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode()); + + // Use the steps for the last part as backedge value for the induction. + if (auto *IV = dyn_cast(&R)) + Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand())); + continue; } - Phi->setIncomingBlock(1, VectorLatchBB); - - // Move the last step to the end of the latch block. 
This ensures - // consistent placement of all induction updates. - Instruction *Inc = cast(Phi->getIncomingValue(1)); - Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode()); - - // Use the steps for the last part as backedge value for the induction. - if (auto *IV = dyn_cast(&R)) - Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand())); - continue; - } - auto *PhiR = cast(&R); bool NeedsScalar = isa(PhiR) || (isa(PhiR) && From 057f2e921543d436d1187ba81afe2e6b6ae3f81b Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 1 Jan 2025 21:55:33 +0000 Subject: [PATCH 19/25] [VPlan] Replace VPBBs with VPIRBBs during skeleton creation (NFC). --- .../Transforms/Vectorize/LoopVectorize.cpp | 20 +++++++++++- llvm/lib/Transforms/Vectorize/VPlan.cpp | 32 +++---------------- 2 files changed, 23 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b03dd471bd971..ee8997ec0fc8d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2601,10 +2601,24 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { AddedSafetyChecks = true; - introduceCheckBlockInVPlan(Plan, MemCheckBlock); + introduceCheckBlockInVPlan(Plan, VectorPHVPBB, MemCheckBlock); return MemCheckBlock; } +/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p +/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must +/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All +/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock. 
+static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) { + VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB); + for (auto &R : make_early_inc_range(*VPBB)) { + assert(!R.isPhi() && "Tried to move phi recipe to end of block"); + R.moveBefore(*IRVPBB, IRVPBB->end()); + } + + VPBlockUtils::reassociateBlocks(VPBB, IRVPBB); + // VPBB is now dead and will be cleaned up when the plan gets destroyed. +} void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopVectorPreHeader = OrigLoop->getLoopPreheader(); assert(LoopVectorPreHeader && "Invalid loop structure"); @@ -2615,9 +2629,11 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopMiddleBlock = SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, LI, nullptr, Twine(Prefix) + "middle.block"); + replaceVPBBWithIRVPBB(Plan.getMiddleBlock(), LoopMiddleBlock); LoopScalarPreHeader = SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, nullptr, Twine(Prefix) + "scalar.ph"); + replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader); } /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV @@ -7727,6 +7743,7 @@ DenseMap LoopVectorizationPlanner::executePlan( // 1. Set up the skeleton for vectorization, including vector pre-header and // middle block. The vector loop is created during VPlan execution. + VPBasicBlock *VectorPH = cast(BestVPlan.getEntry()->getSingleSuccessor()); State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton( ExpandedSCEVs ? 
*ExpandedSCEVs : State.ExpandedSCEVs); if (VectorizingEpilogue) @@ -7764,6 +7781,7 @@ DenseMap LoopVectorizationPlanner::executePlan( BestVPlan.prepareToExecute( ILV.getTripCount(), ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State); + replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB); BestVPlan.execute(&State); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 65a3f04642262..8d4f3b9a94522 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -947,21 +947,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, } } -/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p -/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must -/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All -/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock. -static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) { - VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB); - for (auto &R : make_early_inc_range(*VPBB)) { - assert(!R.isPhi() && "Tried to move phi recipe to end of block"); - R.moveBefore(*IRVPBB, IRVPBB->end()); - } - - VPBlockUtils::reassociateBlocks(VPBB, IRVPBB); - // VPBB is now dead and will be cleaned up when the plan gets destroyed. -} - /// Generate the code inside the preheader and body of the vectorized loop. /// Assumes a single pre-header basic-block was created for this. Introduce /// additional basic-blocks as needed, and fill them all. 
@@ -970,25 +955,14 @@ void VPlan::execute(VPTransformState *State) { State->CFG.PrevVPBB = nullptr; State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor(); BasicBlock *VectorPreHeader = State->CFG.PrevBB; - State->Builder.SetInsertPoint(VectorPreHeader->getTerminator()); + //State->Builder.SetInsertPoint(VectorPreHeader->getTerminator()); // Disconnect VectorPreHeader from ExitBB in both the CFG and DT. cast(VectorPreHeader->getTerminator())->setSuccessor(0, nullptr); State->CFG.DTU.applyUpdates( {{DominatorTree::Delete, VectorPreHeader, State->CFG.ExitBB}}); - // Replace regular VPBB's for the vector preheader, middle and scalar - // preheader blocks with VPIRBasicBlocks wrapping their IR blocks. The IR - // blocks are created during skeleton creation, so we can only create the - // VPIRBasicBlocks now during VPlan execution rather than earlier during VPlan - // construction. - BasicBlock *MiddleBB = State->CFG.ExitBB; - BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor(); - replaceVPBBWithIRVPBB(getVectorPreheader(), VectorPreHeader); - replaceVPBBWithIRVPBB(getMiddleBlock(), MiddleBB); - replaceVPBBWithIRVPBB(getScalarPreheader(), ScalarPh); - - LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << State->VF + LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << State->VF << ", UF=" << getUF() << '\n'); setName("Final VPlan"); LLVM_DEBUG(dump()); @@ -996,6 +970,8 @@ void VPlan::execute(VPTransformState *State) { // Disconnect the middle block from its single successor (the scalar loop // header) in both the CFG and DT. The branch will be recreated during VPlan // execution. 
+ BasicBlock *MiddleBB = State->CFG.ExitBB; + BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor(); auto *BrInst = new UnreachableInst(MiddleBB->getContext()); BrInst->insertBefore(MiddleBB->getTerminator()); MiddleBB->getTerminator()->eraseFromParent(); From fe2c3a51b8cfee9d2d0a549b8f0512f02929e74f Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 1 Jan 2025 22:07:44 +0000 Subject: [PATCH 20/25] [VPlan] Track VectorPH during skeleton creation. --- .../lib/Transforms/Vectorize/LoopVectorize.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index c70ebcefabc5c..c05d9dfe0ad0a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -479,7 +479,9 @@ class InnerLoopVectorizer { AC(AC), ORE(ORE), VF(VecWidth), MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), - PSI(PSI), RTChecks(RTChecks), Plan(Plan) { + PSI(PSI), RTChecks(RTChecks), Plan(Plan), + VectorPHVPBB( + cast(Plan.getEntry()->getSingleSuccessor())) { // Query this against the original loop and save it here because the profile // of the original loop header may change as the transformation happens. OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( @@ -676,6 +678,8 @@ class InnerLoopVectorizer { BasicBlock *AdditionalBypassBlock = nullptr; VPlan &Plan; + + VPBasicBlock *VectorPHVPBB; }; /// Encapsulate information regarding vectorization of a loop and its epilogue. @@ -2446,11 +2450,11 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { /// Introduces a new VPIRBasicBlock for \p CheckIRBB to \p Plan between the /// vector preheader and its predecessor, also connecting the new block to the /// scalar preheader. 
-static void introduceCheckBlockInVPlan(VPlan &Plan, BasicBlock *CheckIRBB) { +static void introduceCheckBlockInVPlan(VPlan &Plan, VPBlockBase *VectorPH, + BasicBlock *CheckIRBB) { VPBlockBase *ScalarPH = Plan.getScalarPreheader(); // FIXME: Cannot get the vector preheader at the moment if the vector loop // region has been removed. - VPBlockBase *VectorPH = Plan.getVectorPreheader(); VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor(); if (PreVectorPH->getNumSuccessors() != 1) { assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors"); @@ -2546,7 +2550,7 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { LoopBypassBlocks.push_back(TCCheckBlock); // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here. - introduceCheckBlockInVPlan(Plan, TCCheckBlock); + introduceCheckBlockInVPlan(Plan, VectorPHVPBB, TCCheckBlock); } BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { @@ -2564,7 +2568,7 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { LoopBypassBlocks.push_back(SCEVCheckBlock); AddedSafetyChecks = true; - introduceCheckBlockInVPlan(Plan, SCEVCheckBlock); + introduceCheckBlockInVPlan(Plan, VectorPHVPBB, SCEVCheckBlock); return SCEVCheckBlock; } @@ -7963,7 +7967,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); - introduceCheckBlockInVPlan(Plan, TCCheckBlock); + introduceCheckBlockInVPlan(Plan, VectorPHVPBB, TCCheckBlock); return TCCheckBlock; } @@ -8103,7 +8107,7 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( Plan.setEntry(NewEntry); // OldEntry is now dead and will be cleaned up when the plan gets destroyed. 
- introduceCheckBlockInVPlan(Plan, Insert); + introduceCheckBlockInVPlan(Plan, VectorPHVPBB, Insert); return Insert; } From 26c94b1c25784c788536f0070df7d2d775192f92 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 1 Jan 2025 22:08:39 +0000 Subject: [PATCH 21/25] !fixup restore original getVectorLoopRegion impl --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 25 ++++++------------- llvm/lib/Transforms/Vectorize/VPlan.h | 8 ++++-- .../truncate-to-minimal-bitwidth-cost.ll | 4 +-- .../vector-loop-backedge-elimination.ll | 12 ++++----- 4 files changed, 22 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 8a67d7f4b75cb..6899657a969d5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1044,28 +1044,19 @@ InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) { return getVectorLoopRegion()->cost(VF, Ctx); } -VPBasicBlock *VPlan::getVectorPreheader() { - VPBlockBase *Current = getEntry()->getSuccessors().back(); - while (Current->getNumSuccessors() == 2) - Current = Current->getSuccessors().back(); - return cast(Current); -} - -VPBasicBlock *VPlan::getVectorPreheader() const { - VPBlockBase *Current = getEntry()->getSuccessors().back(); - while (Current->getNumSuccessors() == 2) - Current = Current->getSuccessors().back(); - return cast(Current); -} - VPRegionBlock *VPlan::getVectorLoopRegion() { // TODO: Cache if possible. - return dyn_cast(getVectorPreheader()->getSingleSuccessor()); - ; + for (VPBlockBase *B : vp_depth_first_shallow(getEntry())) + if (auto *R = dyn_cast(B)) + return R->isReplicator() ? nullptr : R; + return nullptr; } const VPRegionBlock *VPlan::getVectorLoopRegion() const { - return dyn_cast(getVectorPreheader()->getSingleSuccessor()); + for (const VPBlockBase *B : vp_depth_first_shallow(getEntry())) + if (auto *R = dyn_cast(B)) + return R->isReplicator() ? 
nullptr : R; + return nullptr; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index c15018dc49c1c..16024fc69703a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3843,8 +3843,12 @@ class VPlan { const VPBasicBlock *getEntry() const { return Entry; } /// Returns the preheader of the vector loop region. - VPBasicBlock *getVectorPreheader(); - VPBasicBlock *getVectorPreheader() const; + VPBasicBlock *getVectorPreheader() { + VPRegionBlock *VectorRegion = getVectorLoopRegion(); + return VectorRegion + ? cast(VectorRegion->getSinglePredecessor()) + : nullptr; + } /// Returns the VPRegionBlock of the vector loop. VPRegionBlock *getVectorLoopRegion(); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll index 7a6676ff6ea1b..f38aa11b5af87 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll @@ -163,12 +163,12 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i8> [[BROADCAST_SPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = trunc <2 x i8> [[BROADCAST_SPLAT]] to <2 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i1> splat (i1 true), [[TMP0]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT3]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 0, i32 2) ; CHECK-NEXT: [[TMP3:%.*]] = select <2 x i1> 
[[ACTIVE_LANE_MASK]], <2 x i1> [[TMP2]], <2 x i1> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT1]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> zeroinitializer, <2 x ptr> [[BROADCAST_SPLAT4]], i32 1, <2 x i1> [[TMP3]]) ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll index 3de42080b1842..fe5811e7e1159 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll @@ -165,10 +165,10 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF8UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; VF8UF1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1 ; VF8UF1-NEXT: [[TMP1:%.*]] = add i64 2, [[N_VEC]] -; VF8UF1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; VF8UF1-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer ; VF8UF1-NEXT: br label %[[VECTOR_BODY:.*]] ; VF8UF1: [[VECTOR_BODY]]: +; VF8UF1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF8UF1-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer ; VF8UF1-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> , [[BROADCAST_SPLAT1]] ; VF8UF1-NEXT: [[TMP3:%.*]] = extractelement <8 x i1> [[TMP2]], i32 0 ; VF8UF1-NEXT: br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] @@ -253,10 +253,10 @@ define void 
@remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF8UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; VF8UF2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1 ; VF8UF2-NEXT: [[TMP1:%.*]] = add i64 2, [[N_VEC]] -; VF8UF2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; VF8UF2-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer ; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]] ; VF8UF2: [[VECTOR_BODY]]: +; VF8UF2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF8UF2-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer ; VF8UF2-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> , [[BROADCAST_SPLAT1]] ; VF8UF2-NEXT: [[TMP3:%.*]] = icmp ule <8 x i64> , [[BROADCAST_SPLAT1]] ; VF8UF2-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP2]], i32 0 @@ -398,10 +398,10 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF16UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; VF16UF1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1 ; VF16UF1-NEXT: [[TMP1:%.*]] = add i64 2, [[N_VEC]] -; VF16UF1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; VF16UF1-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer ; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]] ; VF16UF1: [[VECTOR_BODY]]: +; VF16UF1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF16UF1-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer ; VF16UF1-NEXT: [[TMP2:%.*]] = icmp ule <16 x i64> , [[BROADCAST_SPLAT1]] ; 
VF16UF1-NEXT: [[TMP3:%.*]] = extractelement <16 x i1> [[TMP2]], i32 0 ; VF16UF1-NEXT: br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] From f180edb07a037e812ccf49bb42dc4ae8281dd74c Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 3 Jan 2025 20:39:26 +0000 Subject: [PATCH 22/25] !fixup address latest comments, thanks --- .../Transforms/Vectorize/LoopVectorize.cpp | 21 ++-- llvm/lib/Transforms/Vectorize/VPlan.cpp | 104 ++++++++++-------- llvm/lib/Transforms/Vectorize/VPlan.h | 5 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 4 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 61 +++++----- 5 files changed, 103 insertions(+), 92 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 4c42dc2b3a410..e7baa3e556db8 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3012,6 +3012,8 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State); } + // Don't apply optimizations below when no vector region remains, as they all + // require a vector loop at the moment. if (!State.Plan->getVectorLoopRegion()) return; @@ -7811,14 +7813,14 @@ DenseMap LoopVectorizationPlanner::executePlan( // 2.6. Maintain Loop Hints // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). 
- MDNode *OrigLoopID = OrigLoop->getLoopID(); + if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) { + MDNode *OrigLoopID = OrigLoop->getLoopID(); - std::optional VectorizedLoopID = - makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, - LLVMLoopVectorizeFollowupVectorized}); + std::optional VectorizedLoopID = + makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, + LLVMLoopVectorizeFollowupVectorized}); - if (auto *R = BestVPlan.getVectorLoopRegion()) { - VPBasicBlock *HeaderVPBB = R->getEntryBasicBlock(); + VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock(); Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); if (VectorizedLoopID) { L->setLoopID(*VectorizedLoopID); @@ -7844,11 +7846,10 @@ DenseMap LoopVectorizationPlanner::executePlan( ILV.printDebugTracesAtEnd(); // 4. Adjust branch weight of the branch in the middle block. - if (auto *R = BestVPlan.getVectorLoopRegion()) { - auto *ExitVPBB = cast(R->getSingleSuccessor()); - + if (BestVPlan.getVectorLoopRegion()) { + auto *MiddleVPBB = BestVPlan.getMiddleBlock(); auto *MiddleTerm = - cast(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator()); + cast(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator()); if (MiddleTerm->isConditional() && hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { // Assume that `Count % VectorTripCount` is equally distributed. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index ac6180379eded..62b4490a52845 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -554,6 +554,8 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) { template static T *getEnclosingLoopRegionForRegion(T *P) { if (P && P->isReplicator()) { P = P->getParent(); + // Multiple loop regions can be nested, but replicate regions can only be + // nested inside a loop region or must be outside any other region. 
assert((!P || !cast(P)->isReplicator()) && "unexpected nested replicate regions"); } @@ -933,6 +935,8 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); // FIXME: Model VF * UF computation completely in VPlan. + assert((!getVectorLoopRegion() || VFxUF.getNumUsers()) && + "VFxUF expected to always have users"); unsigned UF = getUF(); if (VF.getNumUsers()) { Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF); @@ -986,54 +990,56 @@ void VPlan::execute(VPTransformState *State) { for (VPBlockBase *Block : RPOT) Block->execute(State); - if (auto *LoopRegion = getVectorLoopRegion()) { - VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock(); - BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB]; + State->CFG.DTU.flush(); - // Fix the latch value of canonical, reduction and first-order recurrences - // phis in the vector loop. - VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); - for (VPRecipeBase &R : Header->phis()) { - // Skip phi-like recipes that generate their backedege values themselves. - if (isa(&R)) - continue; + auto *LoopRegion = getVectorLoopRegion(); + if (!LoopRegion) + return; - if (isa(&R)) { - PHINode *Phi = nullptr; - if (isa(&R)) { - Phi = cast(State->get(R.getVPSingleValue())); - } else { - auto *WidenPhi = cast(&R); - assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) && - "recipe generating only scalars should have been replaced"); - auto *GEP = cast(State->get(WidenPhi)); - Phi = cast(GEP->getPointerOperand()); - } - - Phi->setIncomingBlock(1, VectorLatchBB); - - // Move the last step to the end of the latch block. This ensures - // consistent placement of all induction updates. - Instruction *Inc = cast(Phi->getIncomingValue(1)); - Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode()); - - // Use the steps for the last part as backedge value for the induction. 
- if (auto *IV = dyn_cast(&R)) - Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand())); - continue; + VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock(); + BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB]; + + // Fix the latch value of canonical, reduction and first-order recurrences + // phis in the vector loop. + VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); + for (VPRecipeBase &R : Header->phis()) { + // Skip phi-like recipes that generate their backedge values themselves. + if (isa(&R)) + continue; + + if (isa(&R)) { + PHINode *Phi = nullptr; + if (isa(&R)) { + Phi = cast(State->get(R.getVPSingleValue())); + } else { + auto *WidenPhi = cast(&R); + assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) && + "recipe generating only scalars should have been replaced"); + auto *GEP = cast(State->get(WidenPhi)); + Phi = cast(GEP->getPointerOperand()); } - auto *PhiR = cast(&R); - bool NeedsScalar = isa(PhiR) || - (isa(PhiR) && - cast(PhiR)->isInLoop()); - Value *Phi = State->get(PhiR, NeedsScalar); - Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar); - cast(Phi)->addIncoming(Val, VectorLatchBB); + Phi->setIncomingBlock(1, VectorLatchBB); + + // Move the last step to the end of the latch block. This ensures + // consistent placement of all induction updates. + Instruction *Inc = cast(Phi->getIncomingValue(1)); + Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode()); + + // Use the steps for the last part as backedge value for the induction.
+ if (auto *IV = dyn_cast(&R)) + Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand())); + continue; } - } - State->CFG.DTU.flush(); + auto *PhiR = cast(&R); + bool NeedsScalar = isa(PhiR) || + (isa(PhiR) && + cast(PhiR)->isInLoop()); + Value *Phi = State->get(PhiR, NeedsScalar); + Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar); + cast(Phi)->addIncoming(Val, VectorLatchBB); + } } InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) { @@ -1399,13 +1405,17 @@ void VPlanIngredient::print(raw_ostream &O) const { #endif -bool VPValue::isDefinedOutsideLoopRegions() const { - - return !hasDefiningRecipe() || - (!getDefiningRecipe()->getParent()->getEnclosingLoopRegion() && - getDefiningRecipe()->getParent()->getPlan()->getVectorLoopRegion()); +/// Returns true if there is a vector loop region and \p VPV is defined in a +/// loop region. +static bool isDefinedInsideLoopRegions(const VPValue *VPV) { + const VPRecipeBase *DefR = VPV->getDefiningRecipe(); + return DefR && (!DefR->getParent()->getPlan()->getVectorLoopRegion() || + DefR->getParent()->getEnclosingLoopRegion()); } +bool VPValue::isDefinedOutsideLoopRegions() const { + return !isDefinedInsideLoopRegions(this); +} void VPValue::replaceAllUsesWith(VPValue *New) { replaceUsesWithIf(New, [](VPUser &, unsigned) { return true; }); } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 16024fc69703a..27ae4dff25e3e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3809,6 +3809,8 @@ class VPlan { ~VPlan(); + /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p + /// EntryBlock must have no predecessors. void setEntry(VPBasicBlock *VPBB) { Entry = VPBB; VPBB->setPlan(this); @@ -3842,7 +3844,8 @@ class VPlan { VPBasicBlock *getEntry() { return Entry; } const VPBasicBlock *getEntry() const { return Entry; } - /// Returns the preheader of the vector loop region. 
+ /// Returns the preheader of the vector loop region, if one exists, or null + /// otherwise. VPBasicBlock *getVectorPreheader() { VPRegionBlock *VectorRegion = getVectorLoopRegion(); return VectorRegion diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 8b424ed055fef..77c08839dbfa9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2376,9 +2376,7 @@ void VPBranchOnMaskRecipe::execute(VPTransformState &State) { // Replace the temporary unreachable terminator with a new conditional branch, // whose two destinations will be set later when they are created. auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); - assert((isa(CurrentTerminator) || - (isa(CurrentTerminator) && - !CurrentTerminator->getOperand(0))) && + assert(isa(CurrentTerminator) && "Expected to replace unreachable terminator with conditional branch."); auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); CondBr->setSuccessor(0, nullptr); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index c59bd874487c8..84a2146977f8b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -794,15 +794,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return R.getVPSingleValue()->replaceAllUsesWith(R.getOperand(1)); } -/// Try to simplify the recipes in \p Plan. If \p CanonicalIVTy is not nullptr, -/// use it directly instead of retrieving the canonical IV type from the plan -/// which may not exist any longer. -static void simplifyRecipes(VPlan &Plan, Type *CanonicalIVTy = nullptr) { +/// Try to simplify the recipes in \p Plan. Use \p CanonicalIVTy as type for all +/// un-typed live-ins in VPTypeAnalysis. 
+static void simplifyRecipes(VPlan &Plan, Type *CanonicalIVTy) { ReversePostOrderTraversal> RPOT( Plan.getEntry()); - Type *CanonicalIVType = - CanonicalIVTy ? CanonicalIVTy : Plan.getCanonicalIV()->getScalarType(); - VPTypeAnalysis TypeInfo(CanonicalIVType); + VPTypeAnalysis TypeInfo(CanonicalIVTy); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { simplifyRecipe(R, TypeInfo); @@ -840,38 +837,40 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C)) return; + // The vector loop region only executes once. If possible, completely remove + // the region, otherwise replace the terminator controlling the latch with + // (BranchOnCond true). Term->eraseFromParent(); auto *Header = cast(VectorRegion->getEntry()); auto *CanIVTy = Plan.getCanonicalIV()->getScalarType(); - if (any_of(Header->phis(), - IsaPred)) { - LLVMContext &Ctx = SE.getContext(); - auto *BOC = new VPInstruction( - VPInstruction::BranchOnCond, - {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc()); - ExitingVPBB->appendRecipe(BOC); - } else { - for (VPRecipeBase &R : make_early_inc_range(Header->phis())) { - auto *P = cast(&R); - P->replaceAllUsesWith(P->getStartValue()); - P->eraseFromParent(); + if (all_of( + Header->phis(), + IsaPred)) { + for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) { + auto *HeaderPhiR = cast(&HeaderR); + HeaderPhiR->replaceAllUsesWith(HeaderPhiR->getStartValue()); + HeaderPhiR->eraseFromParent(); } - VPBlockBase *Preheader = Plan.getVectorPreheader(); - VPBlockBase *Middle = Plan.getMiddleBlock(); + VPBlockBase *Preheader = VectorRegion->getSinglePredecessor(); + VPBlockBase *Exit = VectorRegion->getSingleSuccessor(); VPBlockUtils::disconnectBlocks(Preheader, VectorRegion); - VPBlockUtils::disconnectBlocks(VectorRegion, Middle); + VPBlockUtils::disconnectBlocks(VectorRegion, Exit); - 
Header->setParent(nullptr); - ExitingVPBB->setParent(nullptr); + for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry())) + B->setParent(nullptr); - for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry())) { - if (isa(B)) - B->setParent(nullptr); - } VPBlockUtils::connectBlocks(Preheader, Header); - VPBlockUtils::connectBlocks(ExitingVPBB, Middle); + VPBlockUtils::connectBlocks(ExitingVPBB, Exit); simplifyRecipes(Plan, CanIVTy); + } else { + // The vector region contains header phis for which we cannot remove the + // loop region yet. + LLVMContext &Ctx = SE.getContext(); + auto *BOC = new VPInstruction( + VPInstruction::BranchOnCond, + {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc()); + ExitingVPBB->appendRecipe(BOC); } VPlanTransforms::removeDeadRecipes(Plan); @@ -1287,10 +1286,10 @@ void VPlanTransforms::optimize(VPlan &Plan) { removeRedundantCanonicalIVs(Plan); removeRedundantInductionCasts(Plan); - simplifyRecipes(Plan); + simplifyRecipes(Plan, Plan.getCanonicalIV()->getScalarType()); legalizeAndOptimizeInductions(Plan); removeRedundantExpandSCEVRecipes(Plan); - simplifyRecipes(Plan); + simplifyRecipes(Plan, Plan.getCanonicalIV()->getScalarType()); removeDeadRecipes(Plan); createAndOptimizeReplicateRegions(Plan); From 71ff80a8c69eea1b84ba63a9e7b588410161997f Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 3 Jan 2025 20:45:00 +0000 Subject: [PATCH 23/25] !fixup fix formatting --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e7baa3e556db8..b1aa1d198de2d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7749,7 +7749,8 @@ DenseMap LoopVectorizationPlanner::executePlan( // 1. Set up the skeleton for vectorization, including vector pre-header and // middle block. 
The vector loop is created during VPlan execution. - VPBasicBlock *VectorPH = cast(BestVPlan.getEntry()->getSingleSuccessor()); + VPBasicBlock *VectorPH = + cast(BestVPlan.getEntry()->getSingleSuccessor()); State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton( ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs); if (VectorizingEpilogue) From 4f748274b9250ad4f6c85a40af3727b9adea283f Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 4 Jan 2025 13:59:06 +0000 Subject: [PATCH 24/25] !fixup restore newline and move comment --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 1 + llvm/lib/Transforms/Vectorize/VPlan.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 62b4490a52845..e804f81c36dba 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -447,6 +447,7 @@ void VPBasicBlock::connectToPredecessors(VPTransformState::CFGState &CFG) { CFG.DTU.applyUpdates({{DominatorTree::Insert, PredBB, NewBB}}); } } + void VPIRBasicBlock::execute(VPTransformState *State) { assert(getHierarchicalSuccessors().size() <= 2 && "VPIRBasicBlock can have at most two successors at the moment!"); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 27ae4dff25e3e..3e8021e2625c7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3668,6 +3668,8 @@ class VPRegionBlock : public VPBlockBase { const VPBlockBase *getEntry() const { return Entry; } VPBlockBase *getEntry() { return Entry; } + /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p + /// EntryBlock must have no predecessors. void setEntry(VPBlockBase *EntryBlock) { assert(EntryBlock->getPredecessors().empty() && "Entry block cannot have predecessors."); @@ -3809,8 +3811,6 @@ class VPlan { ~VPlan(); - /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. 
\p - /// EntryBlock must have no predecessors. void setEntry(VPBasicBlock *VPBB) { Entry = VPBB; VPBB->setPlan(this); From d17571ddb8fc1fd216f668d3b936d0f63857b8f4 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 4 Jan 2025 20:10:53 +0000 Subject: [PATCH 25/25] !fixup move erase of term just before removing dead recipes. --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 84a2146977f8b..c8f8b44e76dd6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -840,7 +840,6 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, // The vector loop region only executes once. If possible, completely remove // the region, otherwise replace the terminator controlling the latch with // (BranchOnCond true). - Term->eraseFromParent(); auto *Header = cast(VectorRegion->getEntry()); auto *CanIVTy = Plan.getCanonicalIV()->getScalarType(); if (all_of( @@ -873,6 +872,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, ExitingVPBB->appendRecipe(BOC); } + Term->eraseFromParent(); VPlanTransforms::removeDeadRecipes(Plan); Plan.setVF(BestVF);