Skip to content

Commit 4c6dd70

Browse files
authored
[AMDGPU] Move INIT_EXEC lowering from SILowerControlFlow to SIWholeQuadMode (#94452)
NFCI; this just preserves SI_INIT_EXEC and SI_INIT_EXEC_FROM_INPUT instructions a little longer so that we can reliably identify them in SIWholeQuadMode.
1 parent f5d8c0e commit 4c6dd70

File tree

2 files changed

+102
-104
lines changed

2 files changed

+102
-104
lines changed

llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp

Lines changed: 0 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -103,8 +103,6 @@ class SILowerControlFlow : public MachineFunctionPass {
103103

104104
MachineBasicBlock *emitEndCf(MachineInstr &MI);
105105

106-
void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI);
107-
108106
void findMaskOperands(MachineInstr &MI, unsigned OpNo,
109107
SmallVectorImpl<MachineOperand> &Src) const;
110108

@@ -709,95 +707,6 @@ MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) {
709707
return SplitBB;
710708
}
711709

712-
// Replaces the SI_INIT_EXEC / SI_INIT_EXEC_FROM_INPUT pseudo-instruction MI
// with scalar instructions that write Exec at the top of MBB, then erases MI.
// LIS and LV are only updated when they are non-null.
void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
713-
MachineInstr &MI) {
714-
MachineFunction &MF = *MBB->getParent();
715-
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
716-
bool IsWave32 = ST.isWave32();
717-
718-
// Immediate form: a single S_MOV of operand 0 into Exec replaces the pseudo.
if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
719-
// This should be before all vector instructions.
720-
MachineInstr *InitMI = BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
721-
TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec)
722-
.addImm(MI.getOperand(0).getImm());
723-
if (LIS) {
724-
LIS->RemoveMachineInstrFromMaps(MI);
725-
LIS->InsertMachineInstrInMaps(*InitMI);
726-
}
727-
MI.eraseFromParent();
728-
return;
729-
}
730-
731-
// Extract the thread count from an SGPR input and set EXEC accordingly.
732-
// Since BFM can't shift by 64, handle that case with CMP + CMOV.
733-
//
734-
// S_BFE_U32 count, input, {shift, 7}
735-
// S_BFM_B64 exec, count, 0
736-
// S_CMP_EQ_U32 count, 64
737-
// S_CMOV_B64 exec, -1
738-
Register InputReg = MI.getOperand(0).getReg();
739-
// FirstMI is the insertion point for the new sequence; it starts at the
// first instruction of the block and is adjusted below if that instruction
// is the definition of InputReg.
MachineInstr *FirstMI = &*MBB->begin();
740-
if (InputReg.isVirtual()) {
741-
MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
742-
assert(DefInstr && DefInstr->isCopy());
743-
if (DefInstr->getParent() == MBB) {
744-
if (DefInstr != FirstMI) {
745-
// If the `InputReg` is defined in current block, we also need to
746-
// move that instruction to the beginning of the block.
747-
DefInstr->removeFromParent();
748-
MBB->insert(FirstMI, DefInstr);
749-
if (LIS)
750-
LIS->handleMove(*DefInstr);
751-
} else {
752-
// If first instruction is definition then move pointer after it.
753-
FirstMI = &*std::next(FirstMI->getIterator());
754-
}
755-
}
756-
}
757-
758-
// Insert instruction sequence at block beginning (before vector operations).
759-
const DebugLoc DL = MI.getDebugLoc();
760-
const unsigned WavefrontSize = ST.getWavefrontSize();
761-
// Mask limits the shift taken from operand 1, e.g. 0x7f when WavefrontSize
// is 64.
const unsigned Mask = (WavefrontSize << 1) - 1;
762-
Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
763-
// 0x70000 supplies the field width of 7 from the "{shift, 7}" sequence above
// (presumably the width sits in the upper half of the S_BFE operand — confirm
// against the ISA documentation).
auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
764-
.addReg(InputReg)
765-
.addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
766-
// InputReg is now also read by the S_BFE above; presumably that new use is
// why its LiveVariables info is recomputed here.
if (LV)
767-
LV->recomputeForSingleDefVirtReg(InputReg);
768-
auto BfmMI =
769-
BuildMI(*MBB, FirstMI, DL,
770-
TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
771-
.addReg(CountReg)
772-
.addImm(0);
773-
auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
774-
.addReg(CountReg, RegState::Kill)
775-
.addImm(WavefrontSize);
776-
// Record CmpMI as a kill of CountReg in LV, matching the RegState::Kill flag
// on its operand.
if (LV)
777-
LV->getVarInfo(CountReg).Kills.push_back(CmpMI);
778-
auto CmovMI =
779-
BuildMI(*MBB, FirstMI, DL,
780-
TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
781-
Exec)
782-
.addImm(-1);
783-
784-
// Without LiveIntervals there is nothing further to update; just drop MI.
if (!LIS) {
785-
MI.eraseFromParent();
786-
return;
787-
}
788-
789-
LIS->RemoveMachineInstrFromMaps(MI);
790-
MI.eraseFromParent();
791-
792-
LIS->InsertMachineInstrInMaps(*BfeMI);
793-
LIS->InsertMachineInstrInMaps(*BfmMI);
794-
LIS->InsertMachineInstrInMaps(*CmpMI);
795-
LIS->InsertMachineInstrInMaps(*CmovMI);
796-
797-
// InputReg is queued in RecomputeRegs rather than recomputed here; the code
// that consumes that set is outside this function. CountReg is new, so its
// interval is built immediately.
RecomputeRegs.insert(InputReg);
798-
LIS->createAndComputeVirtRegInterval(CountReg);
799-
}
800-
801710
bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
802711
for (auto &I : MBB.instrs()) {
803712
if (!I.isDebugInstr() && !I.isUnconditionalBranch())
@@ -927,18 +836,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
927836
SplitMBB = process(MI);
928837
Changed = true;
929838
break;
930-
931-
// FIXME: find a better place for this
932-
case AMDGPU::SI_INIT_EXEC:
933-
case AMDGPU::SI_INIT_EXEC_FROM_INPUT:
934-
lowerInitExec(MBB, MI);
935-
if (LIS)
936-
LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
937-
Changed = true;
938-
break;
939-
940-
default:
941-
break;
942839
}
943840

944841
if (SplitMBB != MBB) {

llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

Lines changed: 102 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,7 @@ class SIWholeQuadMode : public MachineFunctionPass {
177177
SmallVector<MachineInstr *, 4> LowerToMovInstrs;
178178
SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
179179
SmallVector<MachineInstr *, 4> KillInstrs;
180+
SmallVector<MachineInstr *, 4> InitExecInstrs;
180181

181182
void printInfo();
182183

@@ -223,6 +224,8 @@ class SIWholeQuadMode : public MachineFunctionPass {
223224
void lowerLiveMaskQueries();
224225
void lowerCopyInstrs();
225226
void lowerKillInstrs(bool IsWQM);
227+
void lowerInitExec(MachineInstr &MI);
228+
void lowerInitExecInstrs();
226229

227230
public:
228231
static char ID;
@@ -580,6 +583,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
580583
Opcode == AMDGPU::SI_DEMOTE_I1) {
581584
KillInstrs.push_back(&MI);
582585
BBI.NeedsLowering = true;
586+
} else if (Opcode == AMDGPU::SI_INIT_EXEC ||
587+
Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
588+
InitExecInstrs.push_back(&MI);
583589
} else if (WQMOutputs) {
584590
// The function is in machine SSA form, which means that physical
585591
// VGPRs correspond to shader inputs and outputs. Inputs are
@@ -1556,6 +1562,97 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
15561562
}
15571563
}
15581564

1565+
// Lowers the SI_INIT_EXEC / SI_INIT_EXEC_FROM_INPUT pseudo-instruction MI into
// scalar instructions that write Exec at the top of MI's parent block, then
// erases MI. LIS is only updated when it is non-null.
void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
1566+
MachineBasicBlock *MBB = MI.getParent();
1567+
bool IsWave32 = ST->isWave32();
1568+
1569+
// Immediate form: a single S_MOV of operand 0 into Exec replaces the pseudo.
if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
1570+
// This should be before all vector instructions.
1571+
MachineInstr *InitMI =
1572+
BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
1573+
TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1574+
Exec)
1575+
.addImm(MI.getOperand(0).getImm());
1576+
if (LIS) {
1577+
LIS->RemoveMachineInstrFromMaps(MI);
1578+
LIS->InsertMachineInstrInMaps(*InitMI);
1579+
}
1580+
MI.eraseFromParent();
1581+
return;
1582+
}
1583+
1584+
// Extract the thread count from an SGPR input and set EXEC accordingly.
1585+
// Since BFM can't shift by 64, handle that case with CMP + CMOV.
1586+
//
1587+
// S_BFE_U32 count, input, {shift, 7}
1588+
// S_BFM_B64 exec, count, 0
1589+
// S_CMP_EQ_U32 count, 64
1590+
// S_CMOV_B64 exec, -1
1591+
Register InputReg = MI.getOperand(0).getReg();
1592+
// FirstMI is the insertion point for the new sequence; it starts at the
// first instruction of the block and is adjusted below if that instruction
// is the definition of InputReg.
MachineInstr *FirstMI = &*MBB->begin();
1593+
if (InputReg.isVirtual()) {
1594+
MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
1595+
assert(DefInstr && DefInstr->isCopy());
1596+
if (DefInstr->getParent() == MBB) {
1597+
if (DefInstr != FirstMI) {
1598+
// If the `InputReg` is defined in current block, we also need to
1599+
// move that instruction to the beginning of the block.
1600+
DefInstr->removeFromParent();
1601+
MBB->insert(FirstMI, DefInstr);
1602+
if (LIS)
1603+
LIS->handleMove(*DefInstr);
1604+
} else {
1605+
// If first instruction is definition then move pointer after it.
1606+
FirstMI = &*std::next(FirstMI->getIterator());
1607+
}
1608+
}
1609+
}
1610+
1611+
// Insert instruction sequence at block beginning (before vector operations).
1612+
const DebugLoc DL = MI.getDebugLoc();
1613+
const unsigned WavefrontSize = ST->getWavefrontSize();
1614+
// Mask limits the shift taken from operand 1, e.g. 0x7f when WavefrontSize
// is 64.
const unsigned Mask = (WavefrontSize << 1) - 1;
1615+
Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
1616+
// 0x70000 supplies the field width of 7 from the "{shift, 7}" sequence above
// (presumably the width sits in the upper half of the S_BFE operand — confirm
// against the ISA documentation).
auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
1617+
.addReg(InputReg)
1618+
.addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
1619+
auto BfmMI =
1620+
BuildMI(*MBB, FirstMI, DL,
1621+
TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
1622+
.addReg(CountReg)
1623+
.addImm(0);
1624+
// This compare is the last reader of CountReg, hence the Kill flag.
auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
1625+
.addReg(CountReg, RegState::Kill)
1626+
.addImm(WavefrontSize);
1627+
auto CmovMI =
1628+
BuildMI(*MBB, FirstMI, DL,
1629+
TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
1630+
Exec)
1631+
.addImm(-1);
1632+
1633+
// Without LiveIntervals there is nothing further to update; just drop MI.
if (!LIS) {
1634+
MI.eraseFromParent();
1635+
return;
1636+
}
1637+
1638+
LIS->RemoveMachineInstrFromMaps(MI);
1639+
MI.eraseFromParent();
1640+
1641+
LIS->InsertMachineInstrInMaps(*BfeMI);
1642+
LIS->InsertMachineInstrInMaps(*BfmMI);
1643+
LIS->InsertMachineInstrInMaps(*CmpMI);
1644+
LIS->InsertMachineInstrInMaps(*CmovMI);
1645+
1646+
// InputReg gained a use (the S_BFE) and its def may have been moved above,
// so its interval is discarded and rebuilt; CountReg is new and gets a fresh
// interval.
// NOTE(review): this recompute runs on InputReg unconditionally, although the
// code above checks InputReg.isVirtual() — confirm InputReg can never be a
// physical register when LIS is available.
LIS->removeInterval(InputReg);
1647+
LIS->createAndComputeVirtRegInterval(InputReg);
1648+
LIS->createAndComputeVirtRegInterval(CountReg);
1649+
}
1650+
1651+
// Lowers every pseudo-instruction recorded in InitExecInstrs by calling
// lowerInitExec on each one, in the order they were recorded.
void SIWholeQuadMode::lowerInitExecInstrs() {
1652+
for (MachineInstr *MI : InitExecInstrs)
1653+
lowerInitExec(*MI);
1654+
}
1655+
15591656
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
15601657
LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
15611658
<< " ------------- \n");
@@ -1567,6 +1664,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
15671664
LowerToCopyInstrs.clear();
15681665
LowerToMovInstrs.clear();
15691666
KillInstrs.clear();
1667+
InitExecInstrs.clear();
15701668
StateTransition.clear();
15711669

15721670
ST = &MF.getSubtarget<GCNSubtarget>();
@@ -1606,10 +1704,13 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
16061704
// Shader is simple: it does not need any state changes or any complex lowering
16071705
if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
16081706
LowerToMovInstrs.empty() && KillInstrs.empty()) {
1707+
lowerInitExecInstrs();
16091708
lowerLiveMaskQueries();
1610-
return !LiveMaskQueries.empty();
1709+
return !InitExecInstrs.empty() || !LiveMaskQueries.empty();
16111710
}
16121711

1712+
lowerInitExecInstrs();
1713+
16131714
MachineBasicBlock &Entry = MF.front();
16141715
MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
16151716

0 commit comments

Comments
 (0)