@@ -177,6 +177,7 @@ class SIWholeQuadMode : public MachineFunctionPass {
177
177
SmallVector<MachineInstr *, 4 > LowerToMovInstrs;
178
178
SmallVector<MachineInstr *, 4 > LowerToCopyInstrs;
179
179
SmallVector<MachineInstr *, 4 > KillInstrs;
180
+ SmallVector<MachineInstr *, 4 > InitExecInstrs;
180
181
181
182
void printInfo ();
182
183
@@ -223,6 +224,8 @@ class SIWholeQuadMode : public MachineFunctionPass {
223
224
void lowerLiveMaskQueries ();
224
225
void lowerCopyInstrs ();
225
226
void lowerKillInstrs (bool IsWQM);
227
+ void lowerInitExec (MachineInstr &MI);
228
+ void lowerInitExecInstrs ();
226
229
227
230
public:
228
231
static char ID;
@@ -580,6 +583,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
580
583
Opcode == AMDGPU::SI_DEMOTE_I1) {
581
584
KillInstrs.push_back (&MI);
582
585
BBI.NeedsLowering = true ;
586
+ } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
587
+ Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
588
+ InitExecInstrs.push_back (&MI);
583
589
} else if (WQMOutputs) {
584
590
// The function is in machine SSA form, which means that physical
585
591
// VGPRs correspond to shader inputs and outputs. Inputs are
@@ -1556,6 +1562,97 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1556
1562
}
1557
1563
}
1558
1564
1565
+ void SIWholeQuadMode::lowerInitExec (MachineInstr &MI) {
1566
+ MachineBasicBlock *MBB = MI.getParent ();
1567
+ bool IsWave32 = ST->isWave32 ();
1568
+
1569
+ if (MI.getOpcode () == AMDGPU::SI_INIT_EXEC) {
1570
+ // This should be before all vector instructions.
1571
+ MachineInstr *InitMI =
1572
+ BuildMI (*MBB, MBB->begin (), MI.getDebugLoc (),
1573
+ TII->get (IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1574
+ Exec)
1575
+ .addImm (MI.getOperand (0 ).getImm ());
1576
+ if (LIS) {
1577
+ LIS->RemoveMachineInstrFromMaps (MI);
1578
+ LIS->InsertMachineInstrInMaps (*InitMI);
1579
+ }
1580
+ MI.eraseFromParent ();
1581
+ return ;
1582
+ }
1583
+
1584
+ // Extract the thread count from an SGPR input and set EXEC accordingly.
1585
+ // Since BFM can't shift by 64, handle that case with CMP + CMOV.
1586
+ //
1587
+ // S_BFE_U32 count, input, {shift, 7}
1588
+ // S_BFM_B64 exec, count, 0
1589
+ // S_CMP_EQ_U32 count, 64
1590
+ // S_CMOV_B64 exec, -1
1591
+ Register InputReg = MI.getOperand (0 ).getReg ();
1592
+ MachineInstr *FirstMI = &*MBB->begin ();
1593
+ if (InputReg.isVirtual ()) {
1594
+ MachineInstr *DefInstr = MRI->getVRegDef (InputReg);
1595
+ assert (DefInstr && DefInstr->isCopy ());
1596
+ if (DefInstr->getParent () == MBB) {
1597
+ if (DefInstr != FirstMI) {
1598
+ // If the `InputReg` is defined in current block, we also need to
1599
+ // move that instruction to the beginning of the block.
1600
+ DefInstr->removeFromParent ();
1601
+ MBB->insert (FirstMI, DefInstr);
1602
+ if (LIS)
1603
+ LIS->handleMove (*DefInstr);
1604
+ } else {
1605
+ // If first instruction is definition then move pointer after it.
1606
+ FirstMI = &*std::next (FirstMI->getIterator ());
1607
+ }
1608
+ }
1609
+ }
1610
+
1611
+ // Insert instruction sequence at block beginning (before vector operations).
1612
+ const DebugLoc DL = MI.getDebugLoc ();
1613
+ const unsigned WavefrontSize = ST->getWavefrontSize ();
1614
+ const unsigned Mask = (WavefrontSize << 1 ) - 1 ;
1615
+ Register CountReg = MRI->createVirtualRegister (&AMDGPU::SGPR_32RegClass);
1616
+ auto BfeMI = BuildMI (*MBB, FirstMI, DL, TII->get (AMDGPU::S_BFE_U32), CountReg)
1617
+ .addReg (InputReg)
1618
+ .addImm ((MI.getOperand (1 ).getImm () & Mask) | 0x70000 );
1619
+ auto BfmMI =
1620
+ BuildMI (*MBB, FirstMI, DL,
1621
+ TII->get (IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
1622
+ .addReg (CountReg)
1623
+ .addImm (0 );
1624
+ auto CmpMI = BuildMI (*MBB, FirstMI, DL, TII->get (AMDGPU::S_CMP_EQ_U32))
1625
+ .addReg (CountReg, RegState::Kill)
1626
+ .addImm (WavefrontSize);
1627
+ auto CmovMI =
1628
+ BuildMI (*MBB, FirstMI, DL,
1629
+ TII->get (IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
1630
+ Exec)
1631
+ .addImm (-1 );
1632
+
1633
+ if (!LIS) {
1634
+ MI.eraseFromParent ();
1635
+ return ;
1636
+ }
1637
+
1638
+ LIS->RemoveMachineInstrFromMaps (MI);
1639
+ MI.eraseFromParent ();
1640
+
1641
+ LIS->InsertMachineInstrInMaps (*BfeMI);
1642
+ LIS->InsertMachineInstrInMaps (*BfmMI);
1643
+ LIS->InsertMachineInstrInMaps (*CmpMI);
1644
+ LIS->InsertMachineInstrInMaps (*CmovMI);
1645
+
1646
+ LIS->removeInterval (InputReg);
1647
+ LIS->createAndComputeVirtRegInterval (InputReg);
1648
+ LIS->createAndComputeVirtRegInterval (CountReg);
1649
+ }
1650
+
1651
+ void SIWholeQuadMode::lowerInitExecInstrs () {
1652
+ for (MachineInstr *MI : InitExecInstrs)
1653
+ lowerInitExec (*MI);
1654
+ }
1655
+
1559
1656
bool SIWholeQuadMode::runOnMachineFunction (MachineFunction &MF) {
1560
1657
LLVM_DEBUG (dbgs () << " SI Whole Quad Mode on " << MF.getName ()
1561
1658
<< " ------------- \n " );
@@ -1567,6 +1664,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
1567
1664
LowerToCopyInstrs.clear ();
1568
1665
LowerToMovInstrs.clear ();
1569
1666
KillInstrs.clear ();
1667
+ InitExecInstrs.clear ();
1570
1668
StateTransition.clear ();
1571
1669
1572
1670
ST = &MF.getSubtarget <GCNSubtarget>();
@@ -1606,10 +1704,13 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
1606
1704
// Shader is simple does not need any state changes or any complex lowering
1607
1705
if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty () &&
1608
1706
LowerToMovInstrs.empty () && KillInstrs.empty ()) {
1707
+ lowerInitExecInstrs ();
1609
1708
lowerLiveMaskQueries ();
1610
- return !LiveMaskQueries.empty ();
1709
+ return !InitExecInstrs. empty () || ! LiveMaskQueries.empty ();
1611
1710
}
1612
1711
1712
+ lowerInitExecInstrs ();
1713
+
1613
1714
MachineBasicBlock &Entry = MF.front ();
1614
1715
MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI ();
1615
1716
0 commit comments