diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 5b4c44302fa62..913942dda19d9 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -225,7 +225,7 @@ class SIWholeQuadMode : public MachineFunctionPass { void lowerCopyInstrs(); void lowerKillInstrs(bool IsWQM); void lowerInitExec(MachineInstr &MI); - void lowerInitExecInstrs(); + MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry); public: static char ID; @@ -1648,9 +1648,23 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) { LIS->createAndComputeVirtRegInterval(CountReg); } -void SIWholeQuadMode::lowerInitExecInstrs() { - for (MachineInstr *MI : InitExecInstrs) +/// Lower all INIT_EXEC instructions. Return an insert point in \p Entry for +/// EXEC users: after the last INIT_EXEC visited in it, else its first non-PHI. +MachineBasicBlock::iterator +SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry) { + MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI(); + + for (MachineInstr *MI : InitExecInstrs) { + // Only an INIT_EXEC in the entry block moves the insert point. Multiple + // INIT_EXECs, or ones outside the entry block, are undefined cases that + // are still lowered here rather than rejected. + if (MI->getParent() == &Entry) + InsertPt = std::next(MI->getIterator()); + lowerInitExec(*MI); + } + + return InsertPt; } bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { @@ -1701,19 +1715,16 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { LiveMaskReg = Exec; + MachineBasicBlock &Entry = MF.front(); + MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry); + // Shader is simple does not need any state changes or any complex lowering if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() && LowerToMovInstrs.empty() && KillInstrs.empty()) { - lowerInitExecInstrs(); lowerLiveMaskQueries(); return !InitExecInstrs.empty() || !LiveMaskQueries.empty(); } - lowerInitExecInstrs(); - - MachineBasicBlock 
&Entry = MF.front(); - MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); - // Store a copy of the original live mask when required if (NeedsLiveMask || (GlobalFlags & StateWQM)) { LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC()); diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 6fcf5067b0225..3bf6c104a0254 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -3395,6 +3395,52 @@ main_body: ret void } +; Test the interaction between wqm and llvm.amdgcn.init.exec. +define amdgpu_gs void @wqm_init_exec() { +; GFX9-W64-LABEL: wqm_init_exec: +; GFX9-W64: ; %bb.0: ; %bb +; GFX9-W64-NEXT: s_mov_b64 exec, -1 +; GFX9-W64-NEXT: s_mov_b32 s0, 0 +; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-W64-NEXT: s_mov_b32 s1, s0 +; GFX9-W64-NEXT: s_mov_b32 s2, s0 +; GFX9-W64-NEXT: s_mov_b32 s3, s0 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-W64-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $exec +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-W64-NEXT: ds_write_b32 v0, v1 +; GFX9-W64-NEXT: s_endpgm +; +; GFX10-W32-LABEL: wqm_init_exec: +; GFX10-W32: ; %bb.0: ; %bb +; GFX10-W32-NEXT: s_mov_b32 exec_lo, -1 +; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-W32-NEXT: s_mov_b32 s0, 0 +; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: s_mov_b32 s2, s0 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-W32-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-W32-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-W32-NEXT: s_mov_b32 s1, s0 +; GFX10-W32-NEXT: s_mov_b32 s3, s0 +; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX10-W32-NEXT: ds_write_b32 v0, v4 +; GFX10-W32-NEXT: s_endpgm +bb: + call void 
@llvm.amdgcn.init.exec(i64 -1) + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> zeroinitializer, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0) + %i = call i32 @llvm.amdgcn.wqm.i32(i32 0) + store i32 %i, i32 addrspace(3)* null, align 4 + ret void +} + declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1