Skip to content

Commit b6bf402

Browse files
huaatiankaiyan96clearlov7e77
authored
[llvm][CodeGen] Add a new software pipeliner 'Window Scheduler' (#84443)
This commit implements the Window Scheduler as described in the RFC: https://discourse.llvm.org/t/rfc-window-scheduling-algorithm-for-machinepipeliner-in-llvm/74718 This Window Scheduler implements the window algorithm designed by Steven Muchnick in the book "Advanced Compiler Design And Implementation", with some improvements: 1. Copy 3 times of the loop kernel and construct the corresponding DAG to identify dependencies between MIs; 2. Use heuristic algorithm to obtain a set of window offsets. The window algorithm is equivalent to modulo scheduling algorithm with a stage of 2. It is mainly applied in targets where hardware resource conflicts are severe, and the SMS algorithm often fails in such cases. On our own DSA, this window algorithm typically can achieve a performance improvement of over 10%. Co-authored-by: Kai Yan <[email protected]> Co-authored-by: Ran Xiao <[email protected]> --------- Co-authored-by: Kai Yan <[email protected]> Co-authored-by: Ran Xiao <[email protected]>
1 parent a144bf2 commit b6bf402

14 files changed

+1904
-3
lines changed

llvm/include/llvm/CodeGen/MachinePipeliner.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,12 @@
4444
#include "llvm/CodeGen/DFAPacketizer.h"
4545
#include "llvm/CodeGen/MachineDominators.h"
4646
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
47+
#include "llvm/CodeGen/MachineScheduler.h"
4748
#include "llvm/CodeGen/RegisterClassInfo.h"
4849
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
4950
#include "llvm/CodeGen/ScheduleDAGMutation.h"
5051
#include "llvm/CodeGen/TargetInstrInfo.h"
52+
#include "llvm/CodeGen/WindowScheduler.h"
5153
#include "llvm/InitializePasses.h"
5254

5355
#include <deque>
@@ -107,6 +109,9 @@ class MachinePipeliner : public MachineFunctionPass {
107109
bool scheduleLoop(MachineLoop &L);
108110
bool swingModuloScheduler(MachineLoop &L);
109111
void setPragmaPipelineOptions(MachineLoop &L);
112+
bool runWindowScheduler(MachineLoop &L);
113+
bool useSwingModuloScheduler();
114+
bool useWindowScheduler(bool Changed);
110115
};
111116

112117
/// This class builds the dependence graph for the instructions in a loop,
@@ -449,7 +454,7 @@ class ResourceManager {
449454
const MCSchedModel &SM;
450455
const TargetSubtargetInfo *ST;
451456
const TargetInstrInfo *TII;
452-
SwingSchedulerDAG *DAG;
457+
ScheduleDAGInstrs *DAG;
453458
const bool UseDFA;
454459
/// DFA resources for each slot
455460
llvm::SmallVector<std::unique_ptr<DFAPacketizer>> DFAResources;
@@ -493,7 +498,7 @@ class ResourceManager {
493498
#endif
494499

495500
public:
496-
ResourceManager(const TargetSubtargetInfo *ST, SwingSchedulerDAG *DAG)
501+
ResourceManager(const TargetSubtargetInfo *ST, ScheduleDAGInstrs *DAG)
497502
: STI(ST), SM(ST->getSchedModel()), ST(ST), TII(ST->getInstrInfo()),
498503
DAG(DAG), UseDFA(ST->useDFAforSMS()),
499504
ProcResourceMasks(SM.getNumProcResourceKinds(), 0),

llvm/include/llvm/CodeGen/TargetSubtargetInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,9 @@ class TargetSubtargetInfo : public MCSubtargetInfo {
199199
/// True if the subtarget should run MachinePipeliner
200200
virtual bool enableMachinePipeliner() const { return true; };
201201

202+
/// True if the subtarget should run WindowScheduler.
203+
virtual bool enableWindowScheduler() const { return true; }
204+
202205
/// True if the subtarget should enable joining global copies.
203206
///
204207
/// By default this is enabled if the machine scheduler is enabled, but
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
//======----------- WindowScheduler.cpp - window scheduler -------------======//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// An implementation of the Window Scheduling software pipelining algorithm.
10+
//
11+
// The concept of the window algorithm was first unveiled in Steven Muchnick's
12+
// book, "Advanced Compiler Design And Implementation", and later elaborated
13+
// upon in Venkatraman Govindaraju's report, "Implementation of Software
14+
// Pipelining Using Window Scheduling".
15+
//
16+
// The window algorithm can be perceived as a modulo scheduling algorithm with a
17+
// stage count of 2. It boasts a higher scheduling success rate in targets with
18+
// severe resource conflicts when compared to the classic Swing Modulo
19+
// Scheduling (SMS) algorithm. To align with the LLVM scheduling framework, we
20+
// have enhanced the original window algorithm. The primary steps are as
21+
// follows:
22+
//
23+
// 1. Instead of duplicating the original MBB twice as mentioned in the
24+
// literature, we copy it three times, generating TripleMBB and the
25+
// corresponding TripleDAG.
26+
//
27+
// 2. We establish a scheduling window on TripleMBB and execute list scheduling
28+
// within it.
29+
//
30+
// 3. After multiple list scheduling, we select the best outcome and expand it
31+
// into the final scheduling result.
32+
//
33+
// To cater to the needs of various targets, we have developed the window
34+
// scheduler in a form that is easily derivable. We recommend employing this
35+
// algorithm in targets with severe resource conflicts, and it can be utilized
36+
// either before or after the Register Allocator (RA).
37+
//
38+
// The default implementation provided here is before RA. If it is to be used
39+
// after RA, certain critical algorithm functions will need to be derived.
40+
//
41+
//===----------------------------------------------------------------------===//
42+
#ifndef LLVM_CODEGEN_WINDOWSCHEDULER_H
43+
#define LLVM_CODEGEN_WINDOWSCHEDULER_H
44+
45+
#include "llvm/CodeGen/MachineLoopInfo.h"
46+
#include "llvm/CodeGen/MachineRegisterInfo.h"
47+
#include "llvm/CodeGen/MachineScheduler.h"
48+
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
49+
#include "llvm/CodeGen/TargetSubtargetInfo.h"
50+
51+
namespace llvm {
52+
53+
enum WindowSchedulingFlag {
54+
WS_Off, /// Turn off window algorithm.
55+
WS_On, /// Use window algorithm after SMS algorithm fails.
56+
WS_Force /// Use window algorithm instead of SMS algorithm.
57+
};
58+
59+
/// The main class in the implementation of the target independent window
60+
/// scheduler.
61+
class WindowScheduler {
62+
protected:
63+
MachineSchedContext *Context = nullptr;
64+
MachineFunction *MF = nullptr;
65+
MachineBasicBlock *MBB = nullptr;
66+
MachineLoop &Loop;
67+
const TargetSubtargetInfo *Subtarget = nullptr;
68+
const TargetInstrInfo *TII = nullptr;
69+
const TargetRegisterInfo *TRI = nullptr;
70+
MachineRegisterInfo *MRI = nullptr;
71+
72+
/// To innovatively identify the dependencies between MIs across two trips, we
73+
/// construct a DAG for a new MBB, which is created by copying the original
74+
/// MBB three times. We refer to this new MBB as 'TripleMBB' and the
75+
/// corresponding DAG as 'TripleDAG'.
76+
/// If the dependencies are more than two trips, we avoid applying window
77+
/// algorithm by identifying successive phis in the old MBB.
78+
std::unique_ptr<ScheduleDAGInstrs> TripleDAG;
79+
/// OriMIs keeps the MIs removed from the original MBB.
80+
SmallVector<MachineInstr *> OriMIs;
81+
/// TriMIs keeps the MIs of TripleMBB, which is used to restore TripleMBB.
82+
SmallVector<MachineInstr *> TriMIs;
83+
/// TriToOri keeps the mappings between the MI clones in TripleMBB and their
84+
/// original MI.
85+
DenseMap<MachineInstr *, MachineInstr *> TriToOri;
86+
/// OriToCycle keeps the mappings between the original MI and its issue cycle.
87+
DenseMap<MachineInstr *, int> OriToCycle;
88+
/// SchedResult keeps the result of each list scheduling, and the format of
89+
/// the tuple is <MI pointer, Cycle, Stage, Order ID>.
90+
SmallVector<std::tuple<MachineInstr *, int, int, int>, 256> SchedResult;
91+
/// SchedPhiNum records the number of phi in the original MBB, and the
92+
/// scheduling starts with MI after phis.
93+
unsigned SchedPhiNum = 0;
94+
/// SchedInstrNum records the MIs involved in scheduling in the original MBB,
95+
/// excluding debug instructions.
96+
unsigned SchedInstrNum = 0;
97+
/// BestII and BestOffset record the characteristics of the best scheduling
98+
/// result and are used together with SchedResult as the final window
99+
/// scheduling result.
100+
unsigned BestII = UINT_MAX;
101+
unsigned BestOffset = 0;
102+
/// BaseII is the II obtained when the window offset is SchedPhiNum. This
103+
/// offset is the initial position of the sliding window.
104+
unsigned BaseII = 0;
105+
106+
public:
107+
WindowScheduler(MachineSchedContext *C, MachineLoop &ML);
108+
virtual ~WindowScheduler() {}
109+
110+
bool run();
111+
112+
protected:
113+
/// Two types of ScheduleDAGs are needed, one for creating dependency graphs
114+
/// only, and the other for list scheduling as determined by the target.
115+
virtual ScheduleDAGInstrs *
116+
createMachineScheduler(bool OnlyBuildGraph = false);
117+
/// Initializes the algorithm and determines if it can be executed.
118+
virtual bool initialize();
119+
/// Add some related processing before running window scheduling.
120+
virtual void preProcess();
121+
/// Add some related processing after running window scheduling.
122+
virtual void postProcess();
123+
/// Back up the MIs in the original MBB and remove them from MBB.
124+
void backupMBB();
125+
/// Erase the MIs in current MBB and restore the original MIs.
126+
void restoreMBB();
127+
/// Make three copies of the original MBB to generate a new TripleMBB.
128+
virtual void generateTripleMBB();
129+
/// Restore the order of MIs in TripleMBB after each list scheduling.
130+
virtual void restoreTripleMBB();
131+
/// Give the folding position in the window algorithm, where different
132+
/// heuristics can be used. It determines the performance and compilation time
133+
/// of the algorithm.
134+
virtual SmallVector<unsigned> getSearchIndexes(unsigned SearchNum,
135+
unsigned SearchRatio);
136+
/// Calculate MIs execution cycle after list scheduling.
137+
virtual int calculateMaxCycle(ScheduleDAGInstrs &DAG, unsigned Offset);
138+
/// Calculate the stall cycle between two trips after list scheduling.
139+
virtual int calculateStallCycle(unsigned Offset, int MaxCycle);
140+
/// Analyzes the II value after each list scheduling.
141+
virtual unsigned analyseII(ScheduleDAGInstrs &DAG, unsigned Offset);
142+
/// Phis are scheduled separately after each list scheduling.
143+
virtual void schedulePhi(int Offset, unsigned &II);
144+
/// Get the final issue order of all scheduled MIs including phis.
145+
DenseMap<MachineInstr *, int> getIssueOrder(unsigned Offset, unsigned II);
146+
/// Update the scheduling result after each list scheduling.
147+
virtual void updateScheduleResult(unsigned Offset, unsigned II);
148+
/// Check whether the final result of window scheduling is valid.
149+
virtual bool isScheduleValid() { return BestOffset != SchedPhiNum; }
150+
/// Using the scheduling infrastructure to expand the results of window
151+
/// scheduling. It is usually necessary to add prologue and epilogue MBBs.
152+
virtual void expand();
153+
/// Update the live intervals for all registers used within MBB.
154+
virtual void updateLiveIntervals();
155+
/// Estimate a II value at which all MIs will be scheduled successfully.
156+
int getEstimatedII(ScheduleDAGInstrs &DAG);
157+
/// Gets the iterator range of MIs in the scheduling window.
158+
iterator_range<MachineBasicBlock::iterator> getScheduleRange(unsigned Offset,
159+
unsigned Num);
160+
/// Get the issue cycle of the new MI based on the cycle of the original MI.
161+
int getOriCycle(MachineInstr *NewMI);
162+
/// Get the original MI from which the new MI is cloned.
163+
MachineInstr *getOriMI(MachineInstr *NewMI);
164+
/// Get the scheduling stage, where the stage of the new MI is identical to
165+
/// the original MI.
166+
unsigned getOriStage(MachineInstr *OriMI, unsigned Offset);
167+
/// Gets the register in phi which is generated from the current MBB.
168+
Register getAntiRegister(MachineInstr *Phi);
169+
};
170+
} // namespace llvm
171+
#endif

llvm/lib/CodeGen/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,7 @@ add_llvm_component_library(LLVMCodeGen
244244
VLIWMachineScheduler.cpp
245245
VirtRegMap.cpp
246246
WasmEHPrepare.cpp
247+
WindowScheduler.cpp
247248
WinEHPrepare.cpp
248249
XRayInstrumentation.cpp
249250
${GeneratedMLSources}

llvm/lib/CodeGen/MachinePipeliner.cpp

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@
6868
#include "llvm/CodeGen/ScheduleDAGMutation.h"
6969
#include "llvm/CodeGen/TargetInstrInfo.h"
7070
#include "llvm/CodeGen/TargetOpcodes.h"
71+
#include "llvm/CodeGen/TargetPassConfig.h"
7172
#include "llvm/CodeGen/TargetRegisterInfo.h"
7273
#include "llvm/CodeGen/TargetSubtargetInfo.h"
7374
#include "llvm/Config/llvm-config.h"
@@ -210,6 +211,17 @@ cl::opt<int> SwpForceIssueWidth(
210211
cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
211212
cl::init(-1));
212213

214+
/// A command line argument to set the window scheduling option.
215+
cl::opt<WindowSchedulingFlag> WindowSchedulingOption(
216+
"window-sched", cl::Hidden, cl::init(WindowSchedulingFlag::WS_On),
217+
cl::desc("Set how to use window scheduling algorithm."),
218+
cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off",
219+
"Turn off window algorithm."),
220+
clEnumValN(WindowSchedulingFlag::WS_On, "on",
221+
"Use window algorithm after SMS algorithm fails."),
222+
clEnumValN(WindowSchedulingFlag::WS_Force, "force",
223+
"Use window algorithm instead of SMS algorithm.")));
224+
213225
} // end namespace llvm
214226

215227
unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
@@ -296,8 +308,11 @@ bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
296308
}
297309

298310
++NumTrytoPipeline;
311+
if (useSwingModuloScheduler())
312+
Changed = swingModuloScheduler(L);
299313

300-
Changed = swingModuloScheduler(L);
314+
if (useWindowScheduler(Changed))
315+
Changed = runWindowScheduler(L);
301316

302317
LI.LoopPipelinerInfo.reset();
303318
return Changed;
@@ -488,9 +503,35 @@ void MachinePipeliner::getAnalysisUsage(AnalysisUsage &AU) const {
488503
AU.addRequired<MachineDominatorTreeWrapperPass>();
489504
AU.addRequired<LiveIntervals>();
490505
AU.addRequired<MachineOptimizationRemarkEmitterPass>();
506+
AU.addRequired<TargetPassConfig>();
491507
MachineFunctionPass::getAnalysisUsage(AU);
492508
}
493509

510+
bool MachinePipeliner::runWindowScheduler(MachineLoop &L) {
511+
MachineSchedContext Context;
512+
Context.MF = MF;
513+
Context.MLI = MLI;
514+
Context.MDT = MDT;
515+
Context.PassConfig = &getAnalysis<TargetPassConfig>();
516+
Context.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
517+
Context.LIS = &getAnalysis<LiveIntervals>();
518+
Context.RegClassInfo->runOnMachineFunction(*MF);
519+
WindowScheduler WS(&Context, L);
520+
return WS.run();
521+
}
522+
523+
bool MachinePipeliner::useSwingModuloScheduler() {
524+
// SwingModuloScheduler does not work when WindowScheduler is forced.
525+
return WindowSchedulingOption != WindowSchedulingFlag::WS_Force;
526+
}
527+
528+
bool MachinePipeliner::useWindowScheduler(bool Changed) {
529+
// WindowScheduler does not work when it is off or when SwingModuloScheduler
530+
// is successfully scheduled.
531+
return WindowSchedulingOption == WindowSchedulingFlag::WS_Force ||
532+
(WindowSchedulingOption == WindowSchedulingFlag::WS_On && !Changed);
533+
}
534+
494535
void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
495536
if (SwpForceII > 0)
496537
MII = SwpForceII;

0 commit comments

Comments
 (0)