Skip to content

Commit 3808ba7

Browse files
authored
[VPlan] Model middle block via VPIRBasicBlock. (#95816)
Use VPIRBasicBlock to wrap the middle block and implement patching up branches in predecessors in VPIRBasicBlock::execute. The IR middle block is only created after skeleton creation. Initially a regular VPBasicBlock is created, which will later be replaced by a VPIRBasicBlock once the middle IR basic block has been created. Note that this slightly changes the order of instructions created in the middle block: code generated by recipe execution in the middle block will now be inserted before the terminator (and thus in between the terminator and the compare it uses). The original order will be restored in #92651. PR: #95816
1 parent 41c6e43 commit 3808ba7

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

53 files changed

+218
-169
lines changed

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 42 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -448,13 +448,29 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
448448
}
449449

450450
void VPIRBasicBlock::execute(VPTransformState *State) {
451-
assert(getHierarchicalPredecessors().empty() &&
452-
"VPIRBasicBlock cannot have predecessors at the moment");
453451
assert(getHierarchicalSuccessors().empty() &&
454452
"VPIRBasicBlock cannot have successors at the moment");
455453

456454
State->Builder.SetInsertPoint(getIRBasicBlock()->getTerminator());
457455
executeRecipes(State, getIRBasicBlock());
456+
457+
for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
458+
VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
459+
BasicBlock *PredBB = State->CFG.VPBB2IRBB[PredVPBB];
460+
assert(PredBB && "Predecessor basic-block not found building successor.");
461+
LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
462+
463+
auto *PredBBTerminator = PredBB->getTerminator();
464+
auto *TermBr = cast<BranchInst>(PredBBTerminator);
465+
// Set each forward successor here when it is created, excluding
466+
// backedges. A backward successor is set when the branch is created.
467+
const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
468+
unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
469+
assert(!TermBr->getSuccessor(idx) &&
470+
"Trying to reset an existing successor block.");
471+
TermBr->setSuccessor(idx, IRBB);
472+
State->CFG.DTU.applyUpdates({{DominatorTree::Insert, PredBB, IRBB}});
473+
}
458474
}
459475

460476
void VPBasicBlock::execute(VPTransformState *State) {
@@ -468,30 +484,14 @@ void VPBasicBlock::execute(VPTransformState *State) {
468484
return R && !R->isReplicator();
469485
};
470486

471-
// 1. Create an IR basic block, or reuse the last one or ExitBB if possible.
472-
if (getPlan()->getVectorLoopRegion()->getSingleSuccessor() == this) {
473-
// ExitBB can be re-used for the exit block of the Plan.
474-
NewBB = State->CFG.ExitBB;
475-
State->CFG.PrevBB = NewBB;
476-
State->Builder.SetInsertPoint(NewBB->getFirstNonPHI());
477-
478-
// Update the branch instruction in the predecessor to branch to ExitBB.
479-
VPBlockBase *PredVPB = getSingleHierarchicalPredecessor();
480-
VPBasicBlock *ExitingVPBB = PredVPB->getExitingBasicBlock();
481-
assert(PredVPB->getSingleSuccessor() == this &&
482-
"predecessor must have the current block as only successor");
483-
BasicBlock *ExitingBB = State->CFG.VPBB2IRBB[ExitingVPBB];
484-
// The Exit block of a loop is always set to be successor 0 of the Exiting
485-
// block.
486-
cast<BranchInst>(ExitingBB->getTerminator())->setSuccessor(0, NewBB);
487-
State->CFG.DTU.applyUpdates({{DominatorTree::Insert, ExitingBB, NewBB}});
488-
} else if (PrevVPBB && /* A */
489-
!((SingleHPred = getSingleHierarchicalPredecessor()) &&
490-
SingleHPred->getExitingBasicBlock() == PrevVPBB &&
491-
PrevVPBB->getSingleHierarchicalSuccessor() &&
492-
(SingleHPred->getParent() == getEnclosingLoopRegion() &&
493-
!IsLoopRegion(SingleHPred))) && /* B */
494-
!(Replica && getPredecessors().empty())) { /* C */
487+
// 1. Create an IR basic block.
488+
if (PrevVPBB && /* A */
489+
!((SingleHPred = getSingleHierarchicalPredecessor()) &&
490+
SingleHPred->getExitingBasicBlock() == PrevVPBB &&
491+
PrevVPBB->getSingleHierarchicalSuccessor() &&
492+
(SingleHPred->getParent() == getEnclosingLoopRegion() &&
493+
!IsLoopRegion(SingleHPred))) && /* B */
494+
!(Replica && getPredecessors().empty())) { /* C */
495495
// The last IR basic block is reused, as an optimization, in three cases:
496496
// A. the first VPBB reuses the loop pre-header BB - when PrevVPBB is null;
497497
// B. when the current VPBB has a single (hierarchical) predecessor which
@@ -842,6 +842,19 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
842842
}
843843
}
844844

845+
/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
846+
/// VPBB are moved to the newly created VPIRBasicBlock.
847+
static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
848+
assert(VPBB->getNumSuccessors() == 0 && "VPBB must be a leave node");
849+
VPIRBasicBlock *IRMiddleVPBB = new VPIRBasicBlock(IRBB);
850+
for (auto &R : make_early_inc_range(*VPBB))
851+
R.moveBefore(*IRMiddleVPBB, IRMiddleVPBB->end());
852+
VPBlockBase *PredVPBB = VPBB->getSinglePredecessor();
853+
VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
854+
VPBlockUtils::connectBlocks(PredVPBB, IRMiddleVPBB);
855+
delete VPBB;
856+
}
857+
845858
/// Generate the code inside the preheader and body of the vectorized loop.
846859
/// Assumes a single pre-header basic-block was created for this. Introduce
847860
/// additional basic-blocks as needed, and fill them all.
@@ -851,6 +864,9 @@ void VPlan::execute(VPTransformState *State) {
851864
State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
852865
BasicBlock *VectorPreHeader = State->CFG.PrevBB;
853866
State->Builder.SetInsertPoint(VectorPreHeader->getTerminator());
867+
replaceVPBBWithIRVPBB(
868+
cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor()),
869+
State->CFG.ExitBB);
854870

855871
// Disconnect VectorPreHeader from ExitBB in both the CFG and DT.
856872
cast<BranchInst>(VectorPreHeader->getTerminator())->setSuccessor(0, nullptr);

llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ define void @firstorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
4747
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
4848
; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
4949
; CHECK: middle.block:
50-
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i8> [[WIDE_LOAD1]], i32 15
5150
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
51+
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i8> [[WIDE_LOAD1]], i32 15
5252
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
5353
; CHECK: scalar.ph:
5454
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -154,10 +154,10 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
154154
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
155155
; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
156156
; CHECK: middle.block:
157+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
157158
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i8> [[WIDE_LOAD5]], i32 15
158159
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <16 x i8> [[TMP8]], i32 15
159160
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT9:%.*]] = extractelement <16 x i8> [[TMP10]], i32 15
160-
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
161161
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
162162
; CHECK: scalar.ph:
163163
; CHECK-NEXT: [[SCALAR_RECUR_INIT10:%.*]] = phi i8 [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT9]], [[MIDDLE_BLOCK]] ]

llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,9 +124,9 @@ define i64 @pointer_induction_only(ptr %start, ptr %end) {
124124
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
125125
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
126126
; CHECK: middle.block:
127+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
127128
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
128129
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
129-
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
130130
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
131131
; CHECK: scalar.ph:
132132
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]

llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,11 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) {
4747
; INTERLEAVE-4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
4848
; INTERLEAVE-4-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
4949
; INTERLEAVE-4: middle.block:
50+
; INTERLEAVE-4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
5051
; INTERLEAVE-4-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP13]], [[TMP12]]
5152
; INTERLEAVE-4-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP14]], [[BIN_RDX]]
5253
; INTERLEAVE-4-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP15]], [[BIN_RDX7]]
5354
; INTERLEAVE-4-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]])
54-
; INTERLEAVE-4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
5555
; INTERLEAVE-4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
5656
; INTERLEAVE-4: scalar.ph:
5757
; INTERLEAVE-4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
@@ -96,9 +96,9 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) {
9696
; INTERLEAVE-2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
9797
; INTERLEAVE-2-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
9898
; INTERLEAVE-2: middle.block:
99+
; INTERLEAVE-2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
99100
; INTERLEAVE-2-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP7]], [[TMP6]]
100101
; INTERLEAVE-2-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
101-
; INTERLEAVE-2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
102102
; INTERLEAVE-2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
103103
; INTERLEAVE-2: scalar.ph:
104104
; INTERLEAVE-2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]

llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -785,8 +785,8 @@ define void @add_phifail(ptr noalias nocapture readonly %p, ptr noalias nocaptur
785785
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
786786
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
787787
; CHECK: middle.block:
788-
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP4]], i32 15
789788
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
789+
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP4]], i32 15
790790
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
791791
; CHECK: scalar.ph:
792792
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -868,9 +868,9 @@ define i8 @add_phifail2(ptr noalias nocapture readonly %p, ptr noalias nocapture
868868
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
869869
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
870870
; CHECK: middle.block:
871+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
871872
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <16 x i32> [[TMP6]], i32 14
872873
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15
873-
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
874874
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
875875
; CHECK: scalar.ph:
876876
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]

llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,13 +98,13 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
9898
; DEFAULT-NEXT: [[TMP59:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
9999
; DEFAULT-NEXT: br i1 [[TMP59]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
100100
; DEFAULT: middle.block:
101+
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
101102
; DEFAULT-NEXT: [[BIN_RDX:%.*]] = or <vscale x 4 x i32> [[TMP58]], [[TMP57]]
102103
; DEFAULT-NEXT: [[TMP60:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
103104
; DEFAULT-NEXT: [[TMP64:%.*]] = call i32 @llvm.vscale.i32()
104105
; DEFAULT-NEXT: [[TMP65:%.*]] = mul i32 [[TMP64]], 4
105106
; DEFAULT-NEXT: [[TMP66:%.*]] = sub i32 [[TMP65]], 1
106107
; DEFAULT-NEXT: [[VECTOR_RECUR_EXTRACT13:%.*]] = extractelement <vscale x 4 x i32> [[TMP20]], i32 [[TMP66]]
107-
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
108108
; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
109109
; DEFAULT: scalar.ph:
110110
; DEFAULT-NEXT: [[SCALAR_RECUR_INIT14:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT13]], [[MIDDLE_BLOCK]] ]
@@ -351,9 +351,9 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
351351
; DEFAULT-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
352352
; DEFAULT-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
353353
; DEFAULT: middle.block:
354+
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
354355
; DEFAULT-NEXT: [[BIN_RDX:%.*]] = or <vscale x 4 x i16> [[TMP22]], [[TMP21]]
355356
; DEFAULT-NEXT: [[TMP24:%.*]] = call i16 @llvm.vector.reduce.or.nxv4i16(<vscale x 4 x i16> [[BIN_RDX]])
356-
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
357357
; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
358358
; DEFAULT: scalar.ph:
359359
; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]

0 commit comments

Comments
 (0)