Skip to content

Commit 2fa7d65

Browse files
AMDGPU: Fix temporal divergence introduced by machine-sink (#67456)
Temporal divergence that was present in input or introduced in IR transforms, like code-sinking or LICM, is handled in SIFixSGPRCopies by changing sgpr source instr to vgpr instr. After 5b657f5, that moved LICM after AMDGPUCodeGenPrepare, machine-sinking can introduce temporal divergence by sinking instructions outside of the cycle. Add isSafeToSink callback in TargetInstrInfo.
1 parent 2d7fe90 commit 2fa7d65

10 files changed

+96
-6
lines changed

llvm/include/llvm/ADT/GenericCycleImpl.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,25 @@ void GenericCycleInfo<ContextT>::compute(FunctionT &F) {
363363
assert(validateTree());
364364
}
365365

366+
template <typename ContextT>
367+
void GenericCycleInfo<ContextT>::splitCriticalEdge(BlockT *Pred, BlockT *Succ,
368+
BlockT *NewBlock) {
369+
// Edge Pred-Succ is replaced by edges Pred-NewBlock and NewBlock-Succ, all
370+
// cycles that had blocks Pred and Succ also get NewBlock.
371+
CycleT *Cycle = this->getCycle(Pred);
372+
if (Cycle && Cycle->contains(Succ)) {
373+
while (Cycle) {
374+
// FixMe: Appending NewBlock is fine as a set of blocks in a cycle. When
375+
// printing cycle NewBlock is at the end of list but it should be in the
376+
// middle to represent actual traversal of a cycle.
377+
Cycle->appendBlock(NewBlock);
378+
BlockMap.try_emplace(NewBlock, Cycle);
379+
Cycle = Cycle->getParentCycle();
380+
}
381+
}
382+
assert(validateTree());
383+
}
384+
366385
/// \brief Find the innermost cycle containing a given block.
367386
///
368387
/// \returns the innermost cycle containing \p Block or nullptr if

llvm/include/llvm/ADT/GenericCycleInfo.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,7 @@ template <typename ContextT> class GenericCycleInfo {
255255

256256
void clear();
257257
void compute(FunctionT &F);
258+
void splitCriticalEdge(BlockT *Pred, BlockT *Succ, BlockT *New);
258259

259260
const FunctionT *getFunction() const { return Context.getFunction(); }
260261
const ContextT &getSSAContext() const { return Context; }

llvm/include/llvm/CodeGen/MachineBasicBlock.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -794,6 +794,15 @@ class MachineBasicBlock
794794
static_cast<const MachineBasicBlock *>(this)->getSingleSuccessor());
795795
}
796796

797+
/// Return the predecessor of this block if it has a single predecessor.
798+
/// Otherwise return a null pointer.
799+
///
800+
const MachineBasicBlock *getSinglePredecessor() const;
801+
MachineBasicBlock *getSinglePredecessor() {
802+
return const_cast<MachineBasicBlock *>(
803+
static_cast<const MachineBasicBlock *>(this)->getSinglePredecessor());
804+
}
805+
797806
/// Return the fallthrough block if the block can implicitly
798807
/// transfer control to the block after it by falling off the end of
799808
/// it. If an explicit branch to the fallthrough block is not allowed,

llvm/include/llvm/CodeGen/TargetInstrInfo.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "llvm/ADT/Uniformity.h"
2020
#include "llvm/CodeGen/MIRFormatter.h"
2121
#include "llvm/CodeGen/MachineBasicBlock.h"
22+
#include "llvm/CodeGen/MachineCycleAnalysis.h"
2223
#include "llvm/CodeGen/MachineFunction.h"
2324
#include "llvm/CodeGen/MachineInstr.h"
2425
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -150,6 +151,11 @@ class TargetInstrInfo : public MCInstrInfo {
150151
return false;
151152
}
152153

154+
virtual bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo,
155+
MachineCycleInfo *CI) const {
156+
return true;
157+
}
158+
153159
protected:
154160
/// For instructions with opcodes for which the M_REMATERIALIZABLE flag is
155161
/// set, this hook lets the target specify whether the instruction is actually

llvm/lib/CodeGen/MachineBasicBlock.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -960,6 +960,10 @@ const MachineBasicBlock *MachineBasicBlock::getSingleSuccessor() const {
960960
return Successors.size() == 1 ? Successors[0] : nullptr;
961961
}
962962

963+
const MachineBasicBlock *MachineBasicBlock::getSinglePredecessor() const {
964+
return Predecessors.size() == 1 ? Predecessors[0] : nullptr;
965+
}
966+
963967
MachineBasicBlock *MachineBasicBlock::getFallThrough(bool JumpToFallThrough) {
964968
MachineFunction::iterator Fallthrough = getIterator();
965969
++Fallthrough;

llvm/lib/CodeGen/MachineSink.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -735,6 +735,7 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
735735

736736
MadeChange = true;
737737
++NumSplit;
738+
CI->splitCriticalEdge(Pair.first, Pair.second, NewSucc);
738739
} else
739740
LLVM_DEBUG(dbgs() << " *** Not legal to break critical edge\n");
740741
}
@@ -1263,6 +1264,9 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB,
12631264
if (SuccToSinkTo && SuccToSinkTo->isInlineAsmBrIndirectTarget())
12641265
return nullptr;
12651266

1267+
if (SuccToSinkTo && !TII->isSafeToSink(MI, SuccToSinkTo, CI))
1268+
return nullptr;
1269+
12661270
return SuccToSinkTo;
12671271
}
12681272

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,48 @@ bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
171171
isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
172172
}
173173

174+
bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
175+
MachineBasicBlock *SuccToSinkTo,
176+
MachineCycleInfo *CI) const {
177+
// Allow sinking if MI edits lane mask (divergent i1 in sgpr).
178+
if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
179+
return true;
180+
181+
MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
182+
// Check if sinking of MI would create temporal divergent use.
183+
for (auto Op : MI.uses()) {
184+
if (Op.isReg() && Op.getReg().isVirtual() &&
185+
RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
186+
MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
187+
188+
// SgprDef defined inside cycle
189+
MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
190+
if (FromCycle == nullptr)
191+
continue;
192+
193+
MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
194+
// Check if there is a FromCycle that contains SgprDef's basic block but
195+
// does not contain SuccToSinkTo and also has divergent exit condition.
196+
while (FromCycle && !FromCycle->contains(ToCycle)) {
197+
// After structurize-cfg, there should be exactly one cycle exit.
198+
SmallVector<MachineBasicBlock *, 1> ExitBlocks;
199+
FromCycle->getExitBlocks(ExitBlocks);
200+
assert(ExitBlocks.size() == 1);
201+
assert(ExitBlocks[0]->getSinglePredecessor());
202+
203+
// FromCycle has divergent exit condition.
204+
if (hasDivergentBranch(ExitBlocks[0]->getSinglePredecessor())) {
205+
return false;
206+
}
207+
208+
FromCycle = FromCycle->getParentCycle();
209+
}
210+
}
211+
}
212+
213+
return true;
214+
}
215+
174216
bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
175217
int64_t &Offset0,
176218
int64_t &Offset1) const {

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
222222

223223
bool isIgnorableUse(const MachineOperand &MO) const override;
224224

225+
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo,
226+
MachineCycleInfo *CI) const override;
227+
225228
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0,
226229
int64_t &Offset1) const override;
227230

llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
167167
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s59
168168
; CHECK-NEXT: s_add_i32 s58, s58, 4
169169
; CHECK-NEXT: s_add_i32 s4, s55, s58
170+
; CHECK-NEXT: v_add_nc_u32_e32 v0, s58, v57
170171
; CHECK-NEXT: s_add_i32 s5, s4, 5
171172
; CHECK-NEXT: s_add_i32 s4, s4, 1
172173
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s5, v42
@@ -267,7 +268,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
267268
; CHECK-NEXT: .LBB0_16: ; %Flow43
268269
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
269270
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57
270-
; CHECK-NEXT: v_add_nc_u32_e32 v57, s58, v57
271+
; CHECK-NEXT: v_mov_b32_e32 v57, v0
271272
; CHECK-NEXT: .LBB0_17: ; %Flow44
272273
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
273274
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s56
@@ -869,6 +870,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
869870
; CHECK-NEXT: s_add_i32 s7, s7, 4
870871
; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
871872
; CHECK-NEXT: s_add_i32 s8, s4, s7
873+
; CHECK-NEXT: v_add_nc_u32_e32 v0, s7, v47
872874
; CHECK-NEXT: s_add_i32 s9, s8, 5
873875
; CHECK-NEXT: s_add_i32 s8, s8, 1
874876
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s9, v41
@@ -879,7 +881,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
879881
; CHECK-NEXT: ; %bb.4: ; %Flow3
880882
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
881883
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
882-
; CHECK-NEXT: v_add_nc_u32_e32 v47, s7, v47
884+
; CHECK-NEXT: v_mov_b32_e32 v47, v0
883885
; CHECK-NEXT: .LBB1_5: ; %Flow4
884886
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
885887
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5

llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ body: |
2222
; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.0, %6, %bb.1
2323
; CHECK-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %8, %bb.1
2424
; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI1]], [[S_MOV_B32_2]], implicit-def dead $scc
25+
; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[S_ADD_I32_]], [[S_ADD_I32_]], 0, implicit $exec
2526
; CHECK-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[S_ADD_I32_]], 0, 0, implicit $mode, implicit $exec
2627
; CHECK-NEXT: [[V_CMP_GT_F32_e64_:%[0-9]+]]:sreg_32 = nofpexcept V_CMP_GT_F32_e64 0, killed [[V_CVT_F32_U32_e64_]], 0, [[COPY]], 0, implicit $mode, implicit $exec
2728
; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[V_CMP_GT_F32_e64_]], [[PHI]], implicit-def dead $scc
@@ -30,7 +31,6 @@ body: |
3031
; CHECK-NEXT: {{ $}}
3132
; CHECK-NEXT: bb.2:
3233
; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
33-
; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[S_ADD_I32_]], [[S_ADD_I32_]], 0, implicit $exec
3434
; CHECK-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
3535
; CHECK-NEXT: SI_RETURN
3636
bb.0:
@@ -83,6 +83,9 @@ body: |
8383
; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.0, %6, %bb.1
8484
; CHECK-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %8, %bb.1
8585
; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI1]], [[S_MOV_B32_2]], implicit-def dead $scc
86+
; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_]], [[S_MOV_B32_2]], implicit-def dead $scc
87+
; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_1]], [[S_MOV_B32_2]], implicit-def dead $scc
88+
; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[S_ADD_I32_2]], [[S_ADD_I32_2]], 0, implicit $exec
8689
; CHECK-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[S_ADD_I32_]], 0, 0, implicit $mode, implicit $exec
8790
; CHECK-NEXT: [[V_CMP_GT_F32_e64_:%[0-9]+]]:sreg_32 = nofpexcept V_CMP_GT_F32_e64 0, killed [[V_CVT_F32_U32_e64_]], 0, [[COPY]], 0, implicit $mode, implicit $exec
8891
; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[V_CMP_GT_F32_e64_]], [[PHI]], implicit-def dead $scc
@@ -91,9 +94,6 @@ body: |
9194
; CHECK-NEXT: {{ $}}
9295
; CHECK-NEXT: bb.2:
9396
; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
94-
; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_]], [[S_MOV_B32_2]], implicit-def dead $scc
95-
; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_1]], [[S_MOV_B32_2]], implicit-def dead $scc
96-
; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[S_ADD_I32_2]], [[S_ADD_I32_2]], 0, implicit $exec
9797
; CHECK-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
9898
; CHECK-NEXT: SI_RETURN
9999
bb.0:

0 commit comments

Comments
 (0)