diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 7d0c1ba8448e6..8c014832f5e46 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -522,13 +522,13 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, // the serialization easier. ReservedRegSet WWMReservedRegs; - using PrologEpilogSGPRSpillsMap = - DenseMap; + using PrologEpilogSGPRSpill = + std::pair; // To track the SGPR spill method used for a CSR SGPR register during // frame lowering. Even though the SGPR spills are handled during // SILowerSGPRSpills pass, some special handling needed later during the // PrologEpilogInserter. - PrologEpilogSGPRSpillsMap PrologEpilogSGPRSpills; + SmallVector PrologEpilogSGPRSpills; // To save/restore EXEC MASK around WWM spills and copies. Register SGPRForEXECCopy; @@ -596,7 +596,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, const WWMSpillsMap &getWWMSpills() const { return WWMSpills; } const ReservedRegSet &getWWMReservedRegs() const { return WWMReservedRegs; } - const PrologEpilogSGPRSpillsMap &getPrologEpilogSGPRSpills() const { + ArrayRef getPrologEpilogSGPRSpills() const { + assert( + is_sorted(PrologEpilogSGPRSpills, [](const auto &LHS, const auto &RHS) { + return LHS.first < RHS.first; + })); return PrologEpilogSGPRSpills; } @@ -606,18 +610,29 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, void addToPrologEpilogSGPRSpills(Register Reg, PrologEpilogSGPRSaveRestoreInfo SI) { - PrologEpilogSGPRSpills.insert(std::make_pair(Reg, SI)); + assert(!hasPrologEpilogSGPRSpillEntry(Reg)); + + // Insert a new entry in the right place to keep the vector in sorted order. + // This should be cheap since the vector is expected to be very short. + PrologEpilogSGPRSpills.insert( + upper_bound( + PrologEpilogSGPRSpills, Reg, + [](const auto &LHS, const auto &RHS) { return LHS < RHS.first; }), + std::make_pair(Reg, SI)); } // Check if an entry created for \p Reg in PrologEpilogSGPRSpills. Return true // on success and false otherwise. bool hasPrologEpilogSGPRSpillEntry(Register Reg) const { - return PrologEpilogSGPRSpills.contains(Reg); + auto I = find_if(PrologEpilogSGPRSpills, + [&Reg](const auto &Spill) { return Spill.first == Reg; }); + return I != PrologEpilogSGPRSpills.end(); } // Get the scratch SGPR if allocated to save/restore \p Reg. Register getScratchSGPRCopyDstReg(Register Reg) const { - auto I = PrologEpilogSGPRSpills.find(Reg); + auto I = find_if(PrologEpilogSGPRSpills, + [&Reg](const auto &Spill) { return Spill.first == Reg; }); if (I != PrologEpilogSGPRSpills.end() && I->second.getKind() == SGPRSaveKind::COPY_TO_SCRATCH_SGPR) return I->second.getReg(); @@ -646,7 +661,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, const PrologEpilogSGPRSaveRestoreInfo & getPrologEpilogSGPRSaveRestoreInfo(Register Reg) const { - auto I = PrologEpilogSGPRSpills.find(Reg); + auto I = find_if(PrologEpilogSGPRSpills, + [&Reg](const auto &Spill) { return Spill.first == Reg; }); assert(I != PrologEpilogSGPRSpills.end()); return I->second; diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll index 1be041c8dc9b0..fbe34a3a3970b 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll @@ -32,11 +32,11 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: v_writelane_b32 v2, s24, 5 ; GFX906-NEXT: s_mov_b64 s[26:27], s[10:11] ; GFX906-NEXT: v_writelane_b32 v2, s26, 6 -; GFX906-NEXT: v_writelane_b32 v41, s34, 2 +; GFX906-NEXT: v_writelane_b32 v41, s16, 4 ; GFX906-NEXT: v_writelane_b32 v2, s27, 7 -; GFX906-NEXT: v_writelane_b32 v41, s35, 3 +; GFX906-NEXT: v_writelane_b32 v41, s34, 2 ; GFX906-NEXT: v_writelane_b32 v2, s8, 8 -; GFX906-NEXT: v_writelane_b32 v41, s16, 4 +; GFX906-NEXT: v_writelane_b32 v41, s35, 3 ; GFX906-NEXT: v_writelane_b32 v2, s9, 9 ; GFX906-NEXT: v_writelane_b32 v41, s30, 0 ; GFX906-NEXT: v_writelane_b32 v2, s4, 10 @@ -340,9 +340,9 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: v_readlane_b32 s31, v41, 1 ; GFX906-NEXT: v_readlane_b32 s30, v41, 0 ; GFX906-NEXT: ; kill: killed $vgpr40 +; GFX906-NEXT: v_readlane_b32 s4, v41, 4 ; GFX906-NEXT: v_readlane_b32 s34, v41, 2 ; GFX906-NEXT: v_readlane_b32 s35, v41, 3 -; GFX906-NEXT: v_readlane_b32 s4, v41, 4 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[30:33] offset:112 ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -383,12 +383,12 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: s_mov_b64 exec, -1 ; GFX908-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill ; GFX908-NEXT: s_mov_b64 exec, s[18:19] +; GFX908-NEXT: v_mov_b32_e32 v3, s16 +; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill ; GFX908-NEXT: v_mov_b32_e32 v3, s34 ; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill ; GFX908-NEXT: v_mov_b32_e32 v3, s35 ; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill -; GFX908-NEXT: v_mov_b32_e32 v3, s16 -; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill ; GFX908-NEXT: s_addk_i32 s32, 0x2c00 ; GFX908-NEXT: s_mov_b64 s[16:17], exec ; GFX908-NEXT: s_mov_b64 exec, 1 @@ -753,16 +753,16 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:172 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_mov_b64 exec, s[4:5] -; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:168 ; 4-byte Folded Reload ; GFX908-NEXT: ; kill: killed $vgpr40 ; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_readfirstlane_b32 s4, v0 +; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s34, v0 ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164 ; 4-byte Folded Reload ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s35, v0 -; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:168 ; 4-byte Folded Reload -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s4, v0 ; GFX908-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GFX908-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload ; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:156 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll index 4aeff3f23993a..be3c0d741ac5f 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -162,6 +162,7 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 { ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], [[FP_SCRATCH_COPY]], 2 ; GCN-NEXT: v_mov_b32_e32 v32, 0 ; GCN-DAG: v_writelane_b32 [[VGPR_REG]], s34, 3 ; GCN: s_mov_b32 s34, s32 @@ -169,14 +170,13 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 ; GCN-DAG: s_add_i32 s32, s32, 0x30000 -; GCN: v_writelane_b32 [[VGPR_REG]], [[FP_SCRATCH_COPY]], 2 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 ; GCN: s_swappc_b64 s[30:31], ; GCN: v_readlane_b32 s31, [[VGPR_REG]], 1 ; GCN: v_readlane_b32 s30, [[VGPR_REG]], 0 -; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3 ; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[VGPR_REG]], 2 +; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] @@ -265,9 +265,9 @@ define void @no_free_regs_spill_bp_to_memory(<32 x i32> %a, i32 %b) #5 { ; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 ; GCN: s_xor_saveexec_b64 s[6:7], -1 ; GCN: buffer_store_dword v39, off, s[0:3], s33 -; GCN: v_mov_b32_e32 v0, s34 -; GCN: buffer_store_dword v0, off, s[0:3], s33 ; GCN: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]] +; GCN: buffer_store_dword v0, off, s[0:3], s33 +; GCN: v_mov_b32_e32 v0, s34 ; GCN-DAG: buffer_store_dword v0, off, s[0:3], s33 %local_val = alloca i32, align 128, addrspace(5) store volatile i32 %b, ptr addrspace(5) %local_val, align 128 @@ -304,13 +304,11 @@ define void @spill_bp_to_memory_scratch_reg_needed_mubuf_offset(<32 x i32> %a, i ; GCN-NEXT: s_add_i32 s5, s33, 0x42100 ; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NOT: v_mov_b32_e32 v0, 0x108c -; GCN-NEXT: s_add_i32 s5, s33, 0x42300 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill ; GCN-NEXT: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]] -; GCN-NOT: v_mov_b32_e32 v0, 0x1088 ; GCN-NEXT: s_add_i32 s5, s33, 0x42200 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NEXT: s_add_i32 s5, s33, 0x42300 ; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill %local_val = alloca i32, align 128, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll index f680bbdd05cdd..8c285f37b4878 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll @@ -18,9 +18,9 @@ define void @vector_reg_liverange_split() #0 { ; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX90A-NEXT: s_mov_b64 exec, s[18:19] +; GFX90A-NEXT: v_writelane_b32 v40, s16, 4 ; GFX90A-NEXT: v_writelane_b32 v40, s28, 2 ; GFX90A-NEXT: v_writelane_b32 v40, s29, 3 -; GFX90A-NEXT: v_writelane_b32 v40, s16, 4 ; GFX90A-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GFX90A-NEXT: v_writelane_b32 v40, s30, 0 ; GFX90A-NEXT: s_addk_i32 s32, 0x400 @@ -48,9 +48,9 @@ define void @vector_reg_liverange_split() #0 { ; GFX90A-NEXT: v_readlane_b32 s31, v40, 1 ; GFX90A-NEXT: v_readlane_b32 s30, v40, 0 ; GFX90A-NEXT: ; kill: killed $vgpr0 +; GFX90A-NEXT: v_readlane_b32 s4, v40, 4 ; GFX90A-NEXT: v_readlane_b32 s28, v40, 2 ; GFX90A-NEXT: v_readlane_b32 s29, v40, 3 -; GFX90A-NEXT: v_readlane_b32 s4, v40, 4 ; GFX90A-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, -1 diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll index 7eabe982ff2bc..5608ea8563548 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll @@ -24,9 +24,9 @@ define void @test() #0 { ; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: v_writelane_b32 v40, s16, 4 ; GCN-NEXT: v_writelane_b32 v40, s28, 2 ; GCN-NEXT: v_writelane_b32 v40, s29, 3 -; GCN-NEXT: v_writelane_b32 v40, s16, 4 ; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_addk_i32 s32, 0x800 @@ -55,9 +55,9 @@ define void @test() #0 { ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: v_readlane_b32 s4, v40, 4 ; GCN-NEXT: v_readlane_b32 s28, v40, 2 ; GCN-NEXT: v_readlane_b32 s29, v40, 3 -; GCN-NEXT: v_readlane_b32 s4, v40, 4 ; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload @@ -79,9 +79,9 @@ define void @test() #0 { ; GCN-O0-NEXT: s_mov_b64 exec, -1 ; GCN-O0-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[18:19] +; GCN-O0-NEXT: v_writelane_b32 v40, s16, 4 ; GCN-O0-NEXT: v_writelane_b32 v40, s28, 2 ; GCN-O0-NEXT: v_writelane_b32 v40, s29, 3 -; GCN-O0-NEXT: v_writelane_b32 v40, s16, 4 ; GCN-O0-NEXT: s_add_i32 s32, s32, 0x400 ; GCN-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GCN-O0-NEXT: v_writelane_b32 v40, s30, 0 @@ -117,9 +117,9 @@ define void @test() #0 { ; GCN-O0-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-O0-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-O0-NEXT: ; kill: killed $vgpr0 +; GCN-O0-NEXT: v_readlane_b32 s4, v40, 4 ; GCN-O0-NEXT: v_readlane_b32 s28, v40, 2 ; GCN-O0-NEXT: v_readlane_b32 s29, v40, 3 -; GCN-O0-NEXT: v_readlane_b32 s4, v40, 4 ; GCN-O0-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, -1