From 63224bb5b28f01fbda49f44e8733a5bd2710c640 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Mon, 17 Jun 2024 09:40:14 +0200 Subject: [PATCH 1/3] [MachineLICM] Fix regression introduced by d4b8b72 --- llvm/lib/CodeGen/MachineLICM.cpp | 45 ++++++++++------------- llvm/test/CodeGen/AMDGPU/indirect-call.ll | 4 +- 2 files changed, 22 insertions(+), 27 deletions(-) diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index 6c5170e918e00..237527a165e85 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -426,38 +426,33 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) { static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI, BitVector &RUs, const uint32_t *Mask) { - // Iterate over the RegMask raw to avoid constructing a BitVector, which is - // expensive as it implies dynamically allocating memory. - // - // We also work backwards. + const unsigned NumRUs = TRI.getNumRegUnits(); + BitVector RUsInMask(NumRUs); + BitVector RUsNotInMask(NumRUs); const unsigned NumRegs = TRI.getNumRegs(); const unsigned MaskWords = (NumRegs + 31) / 32; for (unsigned K = 0; K < MaskWords; ++K) { - // We want to set the bits that aren't in RegMask, so flip it. - uint32_t Word = ~Mask[K]; - - // Iterate all set bits, starting from the right. - while (Word) { - const unsigned SetBitIdx = countr_zero(Word); - - // The bits are numbered from the LSB in each word. - const unsigned PhysReg = (K * 32) + SetBitIdx; - - // Clear the bit at SetBitIdx. Doing it this way appears to generate less - // instructions on x86. This works because negating a number will flip all - // the bits after SetBitIdx. So (Word & -Word) == (1 << SetBitIdx), but - // faster. - Word ^= Word & -Word; - + uint32_t Word = Mask[K]; + for (unsigned Bit = 0; Bit < 32; ++Bit) { + const unsigned PhysReg = (K * 32) + Bit; if (PhysReg == NumRegs) - return; + break; - if (PhysReg) { - for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI) - RUs.set(*RUI); - } + if (!PhysReg) + continue; + + // Extract the bit and apply it to the appropriate mask. + auto &Mask = ((Word >> Bit) & 1) ? RUsInMask : RUsNotInMask; + for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI) + Mask.set(*RUI); } } + + // If a RU needs to be set because it's not in the RegMask, only set it + // if all registers from that RU are not in the mask either. + RUsNotInMask &= RUsInMask.flip(); + + RUs |= RUsNotInMask; } /// Examine the instruction for potentai LICM candidate. Also diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index da8aa54469835..7799b9509ceb0 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -886,12 +886,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s62, 30 ; GCN-NEXT: v_writelane_b32 v40, s63, 31 ; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: s_movk_i32 s4, 0x7b ; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s8, v0 ; GCN-NEXT: v_readfirstlane_b32 s9, v1 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc -; GCN-NEXT: s_movk_i32 s4, 0x7b ; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: s_xor_b64 exec, exec, s[10:11] @@ -980,12 +980,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s62, 30 ; GISEL-NEXT: v_writelane_b32 v40, s63, 31 ; GISEL-NEXT: s_mov_b64 s[6:7], exec +; GISEL-NEXT: s_movk_i32 s4, 0x7b ; GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s8, v0 ; GISEL-NEXT: v_readfirstlane_b32 s9, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] ; GISEL-NEXT: s_and_saveexec_b64 s[10:11], vcc -; GISEL-NEXT: s_movk_i32 s4, 0x7b ; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: s_xor_b64 exec, exec, s[10:11] From 1b848f8dc6628212caac17c4f46ade5441b8c7f9 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Mon, 17 Jun 2024 12:57:53 +0200 Subject: [PATCH 2/3] Comments --- llvm/lib/CodeGen/MachineLICM.cpp | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index 237527a165e85..57fa77d540fe6 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -426,9 +426,7 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) { static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI, BitVector &RUs, const uint32_t *Mask) { - const unsigned NumRUs = TRI.getNumRegUnits(); - BitVector RUsInMask(NumRUs); - BitVector RUsNotInMask(NumRUs); + BitVector ClobberedRUs(TRI.getNumRegUnits(), true); const unsigned NumRegs = TRI.getNumRegs(); const unsigned MaskWords = (NumRegs + 31) / 32; for (unsigned K = 0; K < MaskWords; ++K) { @@ -438,21 +436,15 @@ static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI, if (PhysReg == NumRegs) break; - if (!PhysReg) - continue; - - // Extract the bit and apply it to the appropriate mask. - auto &Mask = ((Word >> Bit) & 1) ? RUsInMask : RUsNotInMask; - for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI) - Mask.set(*RUI); + // Check if we have a valid PhysReg that is set in the mask. + if (PhysReg && ((Word >> Bit) & 1)) { + for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI) + ClobberedRUs.reset(*RUI); + } } } - // If a RU needs to be set because it's not in the RegMask, only set it - // if all registers from that RU are not in the mask either. - RUsNotInMask &= RUsInMask.flip(); - - RUs |= RUsNotInMask; + RUs |= ClobberedRUs; } /// Examine the instruction for potentai LICM candidate. Also From e3f3daee0eac81d959bb107428bd4162be15539a Mon Sep 17 00:00:00 2001 From: pvanhout Date: Mon, 17 Jun 2024 13:40:59 +0200 Subject: [PATCH 3/3] skip empty words + add fixme --- llvm/lib/CodeGen/MachineLICM.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index 57fa77d540fe6..1c76d72ed5152 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -430,13 +430,17 @@ static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI, const unsigned NumRegs = TRI.getNumRegs(); const unsigned MaskWords = (NumRegs + 31) / 32; for (unsigned K = 0; K < MaskWords; ++K) { - uint32_t Word = Mask[K]; + const uint32_t Word = Mask[K]; + if (!Word) + continue; + for (unsigned Bit = 0; Bit < 32; ++Bit) { const unsigned PhysReg = (K * 32) + Bit; if (PhysReg == NumRegs) break; // Check if we have a valid PhysReg that is set in the mask. + // FIXME: We shouldn't have to check for PhysReg. if (PhysReg && ((Word >> Bit) & 1)) { for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI) ClobberedRUs.reset(*RUI);