diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index eebc33aea7a86..ddb0c10604537 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -378,6 +378,13 @@ class AMDGPUMarkLastScratchLoadPass MachineFunctionAnalysisManager &AM); }; +class SIInsertWaitcntsPass : public PassInfoMixin<SIInsertWaitcntsPass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); + static bool isRequired() { return true; } +}; + FunctionPass *createAMDGPUAnnotateUniformValuesLegacy(); ModulePass *createAMDGPUPrintfRuntimeBinding(); @@ -454,7 +461,7 @@ extern char &AMDGPUInsertDelayAluID; void initializeSIInsertHardClausesPass(PassRegistry &); extern char &SIInsertHardClausesID; -void initializeSIInsertWaitcntsPass(PassRegistry&); +void initializeSIInsertWaitcntsLegacyPass(PassRegistry &); extern char &SIInsertWaitcntsID; void initializeSIFormMemoryClausesLegacyPass(PassRegistry &); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 388c390edad6a..68f2321432402 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -111,6 +111,7 @@ MACHINE_FUNCTION_PASS("si-fix-vgpr-copies", SIFixVGPRCopiesPass()) MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass()); MACHINE_FUNCTION_PASS("si-form-memory-clauses", SIFormMemoryClausesPass()) MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass()) +MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass()) MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass()) MACHINE_FUNCTION_PASS("si-lower-control-flow", SILowerControlFlowPass()) MACHINE_FUNCTION_PASS("si-lower-sgpr-spills", SILowerSGPRSpillsPass()) @@ -133,7 +134,6 @@ DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartial DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass()) 
DUMMY_MACHINE_FUNCTION_PASS("si-insert-hard-clauses", SIInsertHardClausesPass()) -DUMMY_MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass()) DUMMY_MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass()) DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass()) // TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 827216f8fde59..0eb0594d87dac 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -536,7 +536,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIAnnotateControlFlowLegacyPass(*PR); initializeAMDGPUInsertDelayAluLegacyPass(*PR); initializeSIInsertHardClausesPass(*PR); - initializeSIInsertWaitcntsPass(*PR); + initializeSIInsertWaitcntsLegacyPass(*PR); initializeSIModeRegisterLegacyPass(*PR); initializeSIWholeQuadModeLegacyPass(*PR); initializeSILowerControlFlowLegacyPass(*PR); @@ -2158,7 +2158,7 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { } addPass(SIMemoryLegalizerPass()); - // TODO: addPass(SIInsertWaitcntsPass()); + addPass(SIInsertWaitcntsPass()); // TODO: addPass(SIModeRegisterPass()); diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 5906b1caeed3d..0edacadc4884f 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -33,6 +33,7 @@ #include "llvm/ADT/Sequence.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/Support/DebugCounter.h" #include "llvm/TargetParser/TargetParser.h" @@ -597,7 +598,7 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { AMDGPU::Waitcnt 
getAllZeroWaitcnt(bool IncludeVSCnt) const override; }; -class SIInsertWaitcnts : public MachineFunctionPass { +class SIInsertWaitcnts { private: const GCNSubtarget *ST = nullptr; const SIInstrInfo *TII = nullptr; @@ -636,9 +637,9 @@ class SIInsertWaitcnts : public MachineFunctionPass { InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS; public: - static char ID; - - SIInsertWaitcnts() : MachineFunctionPass(ID) { + SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT, + AliasAnalysis *AA) + : MLI(MLI), PDT(PDT), AA(AA) { (void)ForceExpCounter; (void)ForceLgkmCounter; (void)ForceVMCounter; @@ -648,20 +649,7 @@ class SIInsertWaitcnts : public MachineFunctionPass { bool isPreheaderToFlush(MachineBasicBlock &MBB, WaitcntBrackets &ScoreBrackets); bool isVMEMOrFlatVMEM(const MachineInstr &MI) const; - bool runOnMachineFunction(MachineFunction &MF) override; - - StringRef getPassName() const override { - return "SI insert wait instructions"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired<MachineLoopInfoWrapperPass>(); - AU.addRequired<MachinePostDominatorTreeWrapperPass>(); - AU.addUsedIfAvailable<AAResultsWrapperPass>(); - AU.addPreserved<AAResultsWrapperPass>(); - MachineFunctionPass::getAnalysisUsage(AU); - } + bool run(MachineFunction &MF); bool isForceEmitWaitcnt() const { for (auto T : inst_counter_types()) @@ -749,6 +737,27 @@ class SIInsertWaitcnts : public MachineFunctionPass { WaitcntBrackets &ScoreBrackets); }; +class SIInsertWaitcntsLegacy : public MachineFunctionPass { +public: + static char ID; + SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "SI insert wait instructions"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<MachineLoopInfoWrapperPass>(); + AU.addRequired<MachinePostDominatorTreeWrapperPass>(); + AU.addUsedIfAvailable<AAResultsWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + } // end anonymous namespace RegInterval 
WaitcntBrackets::getRegInterval(const MachineInstr *MI, @@ -1133,19 +1142,19 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { return hasMixedPendingEvents(T); } -INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, - false) +INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts", + false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) -INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, - false) +INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts", + false, false) -char SIInsertWaitcnts::ID = 0; +char SIInsertWaitcntsLegacy::ID = 0; -char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID; +char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID; FunctionPass *llvm::createSIInsertWaitcntsPass() { - return new SIInsertWaitcnts(); + return new SIInsertWaitcntsLegacy(); } static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, @@ -2481,16 +2490,40 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder(); } -bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { +bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) { + auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI(); + auto *PDT = + &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree(); + AliasAnalysis *AA = nullptr; + if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>()) + AA = &AAR->getAAResults(); + + return SIInsertWaitcnts(MLI, PDT, AA).run(MF); +} + +PreservedAnalyses +SIInsertWaitcntsPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF); + auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF); + auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF) + .getManager() + .getCachedResult<AAManager>(MF.getFunction()); + + if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF)) + return PreservedAnalyses::all(); + + return 
getMachineFunctionPassPreservedAnalyses() + .preserveSet<CFGAnalyses>() + .preserve<AAManager>(); +} + +bool SIInsertWaitcnts::run(MachineFunction &MF) { ST = &MF.getSubtarget<GCNSubtarget>(); TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI(); - PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree(); - if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>()) - AA = &AAR->getAAResults(); AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU()); diff --git a/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir b/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir index b6dc75db3edc1..f776c22866296 100644 --- a/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir +++ b/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes=si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s # $sgpr30_sgpr31 will hold the return address. We need a waitcnt before SI_CALL so # that the return address is not clobbered in the callee by the outstanding load. 
diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir index 28d79efc00b0d..2834ca5fa6858 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass si-insert-waitcnts %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -passes si-insert-waitcnts %s -o - | FileCheck %s --- name: test diff --git a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir index 17e3d93ed393b..f5321591a3c88 100644 --- a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir +++ b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir @@ -2,6 +2,8 @@ # RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx900 -o - %s | FileCheck %s -check-prefixes=CHECK,GFX9 # RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -o - %s | FileCheck %s # RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -o - %s | FileCheck %s + +# RUN: llc -passes=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -o - %s | FileCheck %s --- # CHECK-LABEL: name: vccz_corrupt_workaround # CHECK: $vcc = V_CMP_EQ_F32