
Commit 37acee7

AMDGPU: fix isSafeToSink expecting exactly one predecessor #89224
isSafeToSink needs to check whether a machine cycle has a divergent exit branch, but first it needs the MBB that contains the cycle's exit branch. Early tail-duplication can delete the exit block created by structurize-cfg, so there is still exactly one cycle exit block, but that new exit block can have multiple predecessors. Simplify the search for MBBs that contain the cycle's exit branch by introducing the helper method getExitingBlocks in GenericCycle.
1 parent 673cfcd commit 37acee7
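
As a rough usage sketch (not part of this commit), a client pass with an already-computed MachineCycleInfo could collect the exiting blocks of the innermost machine cycle around a block as follows; the helper name cycleHasExitingBlock and the variables CI and MBB are hypothetical:

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineCycleAnalysis.h"
using namespace llvm;

// Hypothetical helper: true if any block inside the innermost cycle
// containing MBB branches out of that cycle.
static bool cycleHasExitingBlock(const MachineCycleInfo &CI,
                                 MachineBasicBlock &MBB) {
  const MachineCycle *Cycle = CI.getCycle(&MBB); // innermost cycle, or null
  if (!Cycle)
    return false;
  SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
  Cycle->getExitingBlocks(ExitingBlocks); // helper added by this commit
  return !ExitingBlocks.empty();
}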

4 files changed: +117 −7 lines

llvm/include/llvm/ADT/GenericCycleImpl.h

Lines changed: 15 additions & 0 deletions
@@ -66,6 +66,21 @@ void GenericCycle<ContextT>::getExitBlocks(
   }
 }
 
+template <typename ContextT>
+void GenericCycle<ContextT>::getExitingBlocks(
+    SmallVectorImpl<BlockT *> &TmpStorage) const {
+  TmpStorage.clear();
+
+  for (BlockT *Block : blocks()) {
+    for (BlockT *Succ : successors(Block)) {
+      if (!contains(Succ)) {
+        TmpStorage.push_back(Block);
+        break;
+      }
+    }
+  }
+}
+
 template <typename ContextT>
 auto GenericCycle<ContextT>::getCyclePreheader() const -> BlockT * {
   BlockT *Predecessor = getCyclePredecessor();

llvm/include/llvm/ADT/GenericCycleInfo.h

Lines changed: 4 additions & 0 deletions
@@ -126,6 +126,10 @@ template <typename ContextT> class GenericCycle {
   /// branched to.
   void getExitBlocks(SmallVectorImpl<BlockT *> &TmpStorage) const;
 
+  /// Return all blocks of this cycle that have successor outside of this cycle.
+  /// These blocks have cycle exit branch.
+  void getExitingBlocks(SmallVectorImpl<BlockT *> &TmpStorage) const;
+
   /// Return the preheader block for this cycle. Pre-header is well-defined for
   /// reducible cycle in docs/LoopTerminology.rst as: the only one entering
   /// block and its only edge is to the entry block. Return null for irreducible

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 5 additions & 7 deletions
@@ -213,15 +213,13 @@ bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
   // Check if there is a FromCycle that contains SgprDef's basic block but
   // does not contain SuccToSinkTo and also has divergent exit condition.
   while (FromCycle && !FromCycle->contains(ToCycle)) {
-    // After structurize-cfg, there should be exactly one cycle exit.
-    SmallVector<MachineBasicBlock *, 1> ExitBlocks;
-    FromCycle->getExitBlocks(ExitBlocks);
-    assert(ExitBlocks.size() == 1);
-    assert(ExitBlocks[0]->getSinglePredecessor());
+    SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
+    FromCycle->getExitingBlocks(ExitingBlocks);
 
     // FromCycle has divergent exit condition.
-    if (hasDivergentBranch(ExitBlocks[0]->getSinglePredecessor())) {
-      return false;
+    for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
+      if (hasDivergentBranch(ExitingBlock))
+        return false;
     }
 
     FromCycle = FromCycle->getParentCycle();
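
In other words, getExitBlocks collects the blocks outside the cycle that a cycle-leaving edge targets, while the new getExitingBlocks collects the blocks inside the cycle whose terminator leaves it, so the divergence check no longer depends on the exit block having a single predecessor. A hedged sketch of that distinction (FromCycle as in the code above; the vectors are illustrative):

SmallVector<MachineBasicBlock *, 1> ExitBlocks;    // blocks outside the cycle
SmallVector<MachineBasicBlock *, 1> ExitingBlocks; // blocks inside the cycle
FromCycle->getExitBlocks(ExitBlocks);       // targets of cycle-leaving edges
FromCycle->getExitingBlocks(ExitingBlocks); // sources of cycle-leaving edges
// Before this change the check assumed ExitBlocks.size() == 1 and queried
// ExitBlocks[0]->getSinglePredecessor(); after early-tailduplication that
// predecessor need not be unique, so the check now walks ExitingBlocks.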
Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel -verify-machineinstrs < %s | FileCheck %s
+
+; early-tailduplication deletes cycle exit block created by structurize-cfg
+; that had exactly one predecessor. Now, new cycle exit block has two
+; predecessors, we need to find predecessor that belongs to the cycle.
+
+define amdgpu_ps void @_amdgpu_ps_main(i1 %arg) {
+; CHECK-LABEL: _amdgpu_ps_main:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT: s_mov_b32 s5, s4
+; CHECK-NEXT: s_mov_b32 s6, s4
+; CHECK-NEXT: s_mov_b32 s7, s4
+; CHECK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; CHECK-NEXT: s_buffer_load_dword s1, s[4:7], 0x0
+; CHECK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; CHECK-NEXT: s_mov_b32 s10, -1
+; CHECK-NEXT: s_mov_b32 s11, 0x31c16000
+; CHECK-NEXT: s_add_u32 s8, s8, s0
+; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
+; CHECK-NEXT: s_addc_u32 s9, s9, 0
+; CHECK-NEXT: s_mov_b32 s32, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_cmp_ge_i32 s1, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB0_2
+; CHECK-NEXT: .LBB0_1: ; %bb12
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s4
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: v_mov_b32_e32 v3, 0
+; CHECK-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NEXT: s_mov_b64 s[0:1], s[8:9]
+; CHECK-NEXT: s_mov_b64 s[2:3], s[10:11]
+; CHECK-NEXT: s_swappc_b64 s[30:31], 0
+; CHECK-NEXT: .LBB0_2: ; %bb2.preheader
+; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: v_mov_b32_e32 v0, s1
+; CHECK-NEXT: s_branch .LBB0_4
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB0_3: ; %bb6
+; CHECK-NEXT: ; in Loop: Header=BB0_4 Depth=1
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; CHECK-NEXT: s_and_b32 s2, 1, s2
+; CHECK-NEXT: v_or_b32_e32 v1, 1, v0
+; CHECK-NEXT: v_cmp_ne_u32_e64 s2, 0, s2
+; CHECK-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v0
+; CHECK-NEXT: v_mov_b32_e32 v0, v1
+; CHECK-NEXT: s_and_b32 s4, s2, s1
+; CHECK-NEXT: s_andn2_b32 s1, s1, exec_lo
+; CHECK-NEXT: s_and_b32 s2, exec_lo, s4
+; CHECK-NEXT: s_or_b32 s1, s1, s2
+; CHECK-NEXT: s_cbranch_vccz .LBB0_1
+; CHECK-NEXT: .LBB0_4: ; %bb2
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_mov_b32 s2, 0
+; CHECK-NEXT: s_and_saveexec_b32 s3, s0
+; CHECK-NEXT: s_cbranch_execz .LBB0_3
+; CHECK-NEXT: ; %bb.5: ; %bb5
+; CHECK-NEXT: ; in Loop: Header=BB0_4 Depth=1
+; CHECK-NEXT: s_mov_b32 s2, 1
+; CHECK-NEXT: s_branch .LBB0_3
+bb:
+  %i = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> zeroinitializer, i32 0, i32 0)
+  %i1 = icmp slt i32 %i, 0
+  br i1 %i1, label %bb2, label %bb12
+
+bb2:
+  %i3 = phi i1 [ %i9, %bb6 ], [ false, %bb ]
+  %i4 = phi i32 [ %i10, %bb6 ], [ 0, %bb ]
+  br i1 %arg, label %bb5, label %bb6
+
+bb5:
+  br label %bb6
+
+bb6:
+  %i7 = phi i32 [ 0, %bb2 ], [ 1, %bb5 ]
+  %i8 = icmp ne i32 %i7, 0
+  %i9 = select i1 %i8, i1 %i3, i1 false
+  %i10 = or i32 %i4, 1
+  %i11 = icmp slt i32 %i4, 0
+  br i1 %i11, label %bb2, label %bb12
+
+bb12:
+  %i13 = phi i1 [ false, %bb ], [ %i9, %bb6 ]
+  %i14 = select i1 %i13, float 0.000000e+00, float 1.000000e+00
+  %i15 = insertelement <4 x float> zeroinitializer, float %i14, i64 0
+  call amdgpu_gfx addrspace(4) void null(<4 x float> %i15, i32 0)
+  unreachable
+}
+
+declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg)
