Skip to content

Commit 373d9d7

Browse files
dtcxzyw and SForeKeeper authored
[RISCV] Add sched model for XiangShan-NanHu (#70232)
[XiangShan](https://github.com/OpenXiangShan/XiangShan) is an open-source high-performance RISC-V processor. This PR adds the scheduling model for XiangShan-NanHu, the second-generation core of the XiangShan processor series. Overview: https://xiangshan-doc.readthedocs.io/zh-cn/latest/integration/overview/ It is based on the patch [D122556](https://reviews.llvm.org/D122556) by @SForeKeeper; that original patch has not been updated for a long time and is out of sync with the current RTL design. --------- Co-authored-by: SForeKeeper <[email protected]>
1 parent 0091893 commit 373d9d7

File tree

6 files changed

+968
-2
lines changed

6 files changed

+968
-2
lines changed

llvm/lib/Target/RISCV/RISCV.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ include "RISCVSchedRocket.td"
4444
include "RISCVSchedSiFive7.td"
4545
include "RISCVSchedSiFiveP400.td"
4646
include "RISCVSchedSyntacoreSCR1.td"
47+
include "RISCVSchedXiangShanNanHu.td"
4748

4849
//===----------------------------------------------------------------------===//
4950
// RISC-V processors supported.

llvm/lib/Target/RISCV/RISCVProcessors.td

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,7 @@ def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1",
330330
TuneLDADDFusion]>;
331331

332332
def XIANGSHAN_NANHU : RISCVProcessorModel<"xiangshan-nanhu",
333-
NoSchedModel,
333+
XiangShanNanHuModel,
334334
[Feature64Bit,
335335
FeatureStdExtZicsr,
336336
FeatureStdExtZifencei,
@@ -348,4 +348,8 @@ def XIANGSHAN_NANHU : RISCVProcessorModel<"xiangshan-nanhu",
348348
FeatureStdExtZksh,
349349
FeatureStdExtSvinval,
350350
FeatureStdExtZicbom,
351-
FeatureStdExtZicboz]>;
351+
FeatureStdExtZicboz],
352+
[TuneNoDefaultUnroll,
353+
TuneZExtHFusion,
354+
TuneZExtWFusion,
355+
TuneShiftedZExtWFusion]>;
Lines changed: 308 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,308 @@
1+
//==- RISCVSchedXiangShanNanHu.td - XS-NanHu Scheduling Defs -*- tablegen -*-=//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
//===----------------------------------------------------------------------===//
10+
11+
// XiangShan is a high-performance open-source RISC-V processor developed by
12+
// the Institute of Computing Technology (ICT), Chinese Academy of Sciences.
13+
// Source: https://github.com/OpenXiangShan/XiangShan
14+
// Documentation: https://github.com/OpenXiangShan/XiangShan-doc
15+
16+
// XiangShan-NanHu is the second generation of XiangShan processor series.
17+
// Overview: https://xiangshan-doc.readthedocs.io/zh-cn/latest/integration/overview/
18+
19+
// Top-level machine model for the XiangShan-NanHu core. MicroOpBufferSize > 1
// makes LLVM's scheduler treat the core as out-of-order.
def XiangShanNanHuModel : SchedMachineModel {
20+
let MicroOpBufferSize = 256; // presumably the reorder-buffer capacity -- confirm against the RTL
21+
let LoopMicroOpBufferSize = 48; // Instruction queue size
22+
let IssueWidth = 6; // 6-way decode and dispatch
23+
let LoadLatency = 4; // NOTE(review): loads below use Latency = 5 with a 1-cycle load-to-ALU ReadAdvance (effective 4) -- confirm this is the intended relation
24+
let MispredictPenalty = 11; // Based on estimate of pipeline depth.
25+
let CompleteModel = 0; // Instructions without scheduling info are tolerated rather than rejected.
26+
let UnsupportedFeatures = [HasStdExtZcmt, HasStdExtZkr, HasVInstructions,
27+
HasVInstructionsI64];
28+
}
29+
30+
let SchedModel = XiangShanNanHuModel in {
31+
32+
// Reservation stations are distributed: instead of one large station, the core
// uses several smaller ones of 32 or 16 entries each. Every resource group
// below is modeled with a 16-entry buffer.
33+
let BufferSize = 16 in {
34+
def XS2ALU : ProcResource<4>; // 4 units: integer ALU, shifts and the single-cycle Zb* ops
35+
def XS2MDU : ProcResource<2>; // 2 units: integer multiply/divide and the 3-cycle Zb* ops
36+
def XS2MISC : ProcResource<1>; // 1 unit: jumps/branches and CSR accesses
37+
38+
def XS2FMAC : ProcResource<4>; // 4 units: FP add/mul/FMA, conversions, compares, classify and moves
39+
def XS2FMISC : ProcResource<2>; // 2 units: FP divide and square root
40+
41+
// Load/Store queues are not modeled; only the load and store units are.
42+
def XS2LD : ProcResource<2>; // 2 units: loads (integer, FP, atomic)
43+
def XS2ST : ProcResource<2>; // 2 units: stores (integer, FP, atomic)
44+
}
45+
46+
// Branching: WriteJmp/WriteJal/WriteJalr all issue to the single XS2MISC unit.
// No Latency is given here, so the WriteRes default applies.
47+
def : WriteRes<WriteJmp, [XS2MISC]>;
48+
def : WriteRes<WriteJal, [XS2MISC]>;
49+
def : WriteRes<WriteJalr, [XS2MISC]>;
50+
51+
// Integer arithmetic, logic and shifts (immediate and register forms, 32- and
// 64-bit): single-cycle on any of the XS2ALU units.
52+
let Latency = 1 in {
53+
def : WriteRes<WriteIALU, [XS2ALU]>;
54+
def : WriteRes<WriteIALU32, [XS2ALU]>;
55+
def : WriteRes<WriteShiftImm, [XS2ALU]>;
56+
def : WriteRes<WriteShiftImm32, [XS2ALU]>;
57+
def : WriteRes<WriteShiftReg, [XS2ALU]>;
58+
def : WriteRes<WriteShiftReg32, [XS2ALU]>;
59+
}
60+
61+
// Integer multiplication: 3-cycle latency on the XS2MDU units. No
// ReleaseAtCycles override is given, so the unit is not held for the full
// latency (contrast with integer division below).
62+
let Latency = 3 in {
63+
def : WriteRes<WriteIMul, [XS2MDU]>;
64+
def : WriteRes<WriteIMul32, [XS2MDU]>;
65+
}
66+
67+
// Integer division
68+
// SRT16 algorithm. Latency and ReleaseAtCycles are both 20, so a divide keeps
// its XS2MDU unit reserved for the whole operation (modeled as unpipelined).
// NOTE(review): the same fixed 20 cycles is used for 32- and 64-bit divides;
// presumably a representative figure rather than an exact one -- confirm.
69+
let Latency = 20, ReleaseAtCycles = [20] in {
70+
def : WriteRes<WriteIDiv32, [XS2MDU]>;
71+
def : WriteRes<WriteIDiv, [XS2MDU]>;
72+
}
73+
74+
// Zb*: single-cycle bit-manipulation ops, executed on the XS2ALU units.
75+
let Latency = 1 in {
76+
// Zba
77+
def : WriteRes<WriteSHXADD, [XS2ALU]>;
78+
def : WriteRes<WriteSHXADD32, [XS2ALU]>;
79+
80+
// Zbb
81+
def : WriteRes<WriteRotateImm, [XS2ALU]>;
82+
def : WriteRes<WriteRotateImm32, [XS2ALU]>;
83+
def : WriteRes<WriteRotateReg, [XS2ALU]>;
84+
def : WriteRes<WriteRotateReg32, [XS2ALU]>;
85+
def : WriteRes<WriteORCB, [XS2ALU]>;
86+
def : WriteRes<WriteREV8, [XS2ALU]>;
87+
88+
// Zbkb
89+
def : WriteRes<WriteBREV8, [XS2ALU]>;
90+
def : WriteRes<WritePACK, [XS2ALU]>;
91+
def : WriteRes<WritePACK32, [XS2ALU]>;
92+
def : WriteRes<WriteZIP, [XS2ALU]>;
93+
94+
// Zbs
95+
def : WriteRes<WriteSingleBit, [XS2ALU]>;
96+
def : WriteRes<WriteSingleBitImm, [XS2ALU]>;
97+
def : WriteRes<WriteBEXT, [XS2ALU]>;
98+
def : WriteRes<WriteBEXTI, [XS2ALU]>;
99+
}
100+
101+
// Zb*: 3-cycle bit-manipulation ops (count, carry-less multiply, crossbar
// permute), executed on the XS2MDU units.
let Latency = 3 in {
102+
// Zbb
103+
def : WriteRes<WriteCLZ, [XS2MDU]>;
104+
def : WriteRes<WriteCLZ32, [XS2MDU]>;
105+
def : WriteRes<WriteCTZ, [XS2MDU]>;
106+
def : WriteRes<WriteCTZ32, [XS2MDU]>;
107+
def : WriteRes<WriteCPOP, [XS2MDU]>;
108+
def : WriteRes<WriteCPOP32, [XS2MDU]>;
109+
110+
// Zbkc
111+
def : WriteRes<WriteCLMUL, [XS2MDU]>;
112+
113+
// Zbkx
114+
def : WriteRes<WriteXPERM, [XS2MDU]>;
115+
}
116+
117+
// Memory
// Stores (integer, FP and atomic) issue to the XS2ST units. No Latency is
// given here, so the WriteRes default applies.
118+
def : WriteRes<WriteSTB, [XS2ST]>;
119+
def : WriteRes<WriteSTH, [XS2ST]>;
120+
def : WriteRes<WriteSTW, [XS2ST]>;
121+
def : WriteRes<WriteSTD, [XS2ST]>;
122+
def : WriteRes<WriteFST32, [XS2ST]>;
123+
def : WriteRes<WriteFST64, [XS2ST]>;
124+
def : WriteRes<WriteAtomicSTW, [XS2ST]>;
125+
def : WriteRes<WriteAtomicSTD, [XS2ST]>;
126+
127+
// Loads and the remaining atomic writes (WriteAtomic*) take 5 cycles on the
// XS2LD units; FP loads use the same latency.
let Latency = 5 in {
128+
def : WriteRes<WriteLDB, [XS2LD]>;
129+
def : WriteRes<WriteLDH, [XS2LD]>;
130+
def : WriteRes<WriteLDW, [XS2LD]>;
131+
def : WriteRes<WriteLDD, [XS2LD]>;
132+
133+
def : WriteRes<WriteAtomicW, [XS2LD]>;
134+
def : WriteRes<WriteAtomicD, [XS2LD]>;
135+
def : WriteRes<WriteAtomicLDW, [XS2LD]>;
136+
def : WriteRes<WriteAtomicLDD, [XS2LD]>;
137+
138+
def : WriteRes<WriteFLD32, [XS2LD]>;
139+
def : WriteRes<WriteFLD64, [XS2LD]>;
140+
}
141+
142+
// XiangShan-NanHu uses the FuDian FPU instead of Berkeley HardFloat.
143+
// Documentation: https://github.com/OpenXiangShan/fudian
144+
145+
// FP add, sign-injection, min/max, conversions, classify, compare and
// FP<->integer moves: 3 cycles on the XS2FMAC units.
let Latency = 3 in {
146+
def : WriteRes<WriteFAdd32, [XS2FMAC]>;
147+
def : WriteRes<WriteFSGNJ32, [XS2FMAC]>;
148+
def : WriteRes<WriteFMinMax32, [XS2FMAC]>;
149+
def : WriteRes<WriteFAdd64, [XS2FMAC]>;
150+
def : WriteRes<WriteFSGNJ64, [XS2FMAC]>;
151+
def : WriteRes<WriteFMinMax64, [XS2FMAC]>;
152+
153+
def : WriteRes<WriteFCvtI32ToF32, [XS2FMAC]>;
154+
def : WriteRes<WriteFCvtI32ToF64, [XS2FMAC]>;
155+
def : WriteRes<WriteFCvtI64ToF32, [XS2FMAC]>;
156+
def : WriteRes<WriteFCvtI64ToF64, [XS2FMAC]>;
157+
def : WriteRes<WriteFCvtF32ToI32, [XS2FMAC]>;
158+
def : WriteRes<WriteFCvtF32ToI64, [XS2FMAC]>;
159+
def : WriteRes<WriteFCvtF64ToI32, [XS2FMAC]>;
160+
def : WriteRes<WriteFCvtF64ToI64, [XS2FMAC]>;
161+
def : WriteRes<WriteFCvtF32ToF64, [XS2FMAC]>;
162+
def : WriteRes<WriteFCvtF64ToF32, [XS2FMAC]>;
163+
164+
def : WriteRes<WriteFClass32, [XS2FMAC]>;
165+
def : WriteRes<WriteFClass64, [XS2FMAC]>;
166+
def : WriteRes<WriteFCmp32, [XS2FMAC]>;
167+
def : WriteRes<WriteFCmp64, [XS2FMAC]>;
168+
def : WriteRes<WriteFMovF32ToI32, [XS2FMAC]>;
169+
def : WriteRes<WriteFMovI32ToF32, [XS2FMAC]>;
170+
def : WriteRes<WriteFMovF64ToI64, [XS2FMAC]>;
171+
def : WriteRes<WriteFMovI64ToF64, [XS2FMAC]>;
172+
}
173+
174+
// FP multiplication: 3 cycles on the XS2FMAC units.
175+
let Latency = 3 in {
176+
def : WriteRes<WriteFMul32, [XS2FMAC]>;
177+
def : WriteRes<WriteFMul64, [XS2FMAC]>;
178+
}
179+
180+
// FP fused multiply-add: 5 cycles on the XS2FMAC units. The addend operand
// gets a 2-cycle ReadAdvance in the bypass section of this file.
let Latency = 5 in {
181+
def : WriteRes<WriteFMA32, [XS2FMAC]>;
182+
def : WriteRes<WriteFMA64, [XS2FMAC]>;
183+
}
184+
185+
// FP division and square root, on the XS2FMISC units.
// NOTE(review): unlike integer division, no ReleaseAtCycles is set here, so
// these are modeled as not holding the unit for their full latency -- confirm
// this matches the hardware.
186+
def : WriteRes<WriteFDiv32, [XS2FMISC]> {
187+
let Latency = 11;
188+
}
189+
def : WriteRes<WriteFDiv64, [XS2FMISC]> {
190+
let Latency = 18;
191+
}
192+
193+
def : WriteRes<WriteFSqrt32, [XS2FMISC]> {
194+
let Latency = 17;
195+
}
196+
def : WriteRes<WriteFSqrt64, [XS2FMISC]> {
197+
let Latency = 31;
198+
}
199+
200+
// Others
201+
def : WriteRes<WriteCSR, [XS2MISC]>; // CSR accesses share the XS2MISC unit with jumps.
202+
def : WriteRes<WriteNop, []>; // Nops are given no execution resource.
203+
204+
// Schedule the COPY pseudo-instruction like an integer ALU op.
def : InstRW<[WriteIALU], (instrs COPY)>;
205+
206+
// Bypass and advance
207+
208+
// Load-to-use bypass for ALU-class operands: when `read` consumes a value
// produced by one of the listed integer load / atomic writes, the producer's
// latency is reduced by 1 cycle (5 -> 4 for this model's loads). FP loads
// (WriteFLD32/WriteFLD64) are not in the list.
class XS2LoadToALUBypass<SchedRead read>
209+
: ReadAdvance<read, 1, [WriteLDB, WriteLDH, WriteLDW, WriteLDD, WriteAtomicW, WriteAtomicD, WriteAtomicLDW, WriteAtomicLDD]>;
210+
211+
// Base-ISA, atomic and FP operand reads. An advance of 0 means the consumer
// sees the producer's full latency. The only nonzero advances in this list are
// the ALU-class reads (XS2LoadToALUBypass, 1 cycle after a load) and the FMA
// addend (2 cycles).
def : ReadAdvance<ReadJmp, 0>;
212+
def : ReadAdvance<ReadJalr, 0>;
213+
def : ReadAdvance<ReadCSR, 0>;
214+
def : ReadAdvance<ReadStoreData, 0>;
215+
def : ReadAdvance<ReadMemBase, 0>;
216+
def : XS2LoadToALUBypass<ReadIALU>;
217+
def : XS2LoadToALUBypass<ReadIALU32>;
218+
def : XS2LoadToALUBypass<ReadShiftImm>;
219+
def : XS2LoadToALUBypass<ReadShiftImm32>;
220+
def : XS2LoadToALUBypass<ReadShiftReg>;
221+
def : XS2LoadToALUBypass<ReadShiftReg32>;
222+
def : ReadAdvance<ReadIDiv, 0>;
223+
def : ReadAdvance<ReadIDiv32, 0>;
224+
def : ReadAdvance<ReadIMul, 0>;
225+
def : ReadAdvance<ReadIMul32, 0>;
226+
def : ReadAdvance<ReadAtomicWA, 0>;
227+
def : ReadAdvance<ReadAtomicWD, 0>;
228+
def : ReadAdvance<ReadAtomicDA, 0>;
229+
def : ReadAdvance<ReadAtomicDD, 0>;
230+
def : ReadAdvance<ReadAtomicLDW, 0>;
231+
def : ReadAdvance<ReadAtomicLDD, 0>;
232+
def : ReadAdvance<ReadAtomicSTW, 0>;
233+
def : ReadAdvance<ReadAtomicSTD, 0>;
234+
def : ReadAdvance<ReadFStoreData, 0>;
235+
def : ReadAdvance<ReadFMemBase, 0>;
236+
def : ReadAdvance<ReadFAdd32, 0>;
237+
def : ReadAdvance<ReadFAdd64, 0>;
238+
def : ReadAdvance<ReadFMul32, 0>;
239+
def : ReadAdvance<ReadFMul64, 0>;
240+
def : ReadAdvance<ReadFMA32, 0>;
241+
def : ReadAdvance<ReadFMA32Addend, 2>; // Cascade FMA: the addend is read 2 cycles late, so an FMA feeding the next FMA's addend costs 5 - 2 = 3 cycles
242+
def : ReadAdvance<ReadFMA64, 0>;
243+
def : ReadAdvance<ReadFMA64Addend, 2>; // Cascade FMA: same 2-cycle late read of the addend as the 32-bit form
244+
def : ReadAdvance<ReadFDiv32, 0>;
245+
def : ReadAdvance<ReadFDiv64, 0>;
246+
def : ReadAdvance<ReadFSqrt32, 0>;
247+
def : ReadAdvance<ReadFSqrt64, 0>;
248+
def : ReadAdvance<ReadFCmp32, 0>;
249+
def : ReadAdvance<ReadFCmp64, 0>;
250+
def : ReadAdvance<ReadFSGNJ32, 0>;
251+
def : ReadAdvance<ReadFSGNJ64, 0>;
252+
def : ReadAdvance<ReadFMinMax32, 0>;
253+
def : ReadAdvance<ReadFMinMax64, 0>;
254+
def : ReadAdvance<ReadFCvtF32ToI32, 0>;
255+
def : ReadAdvance<ReadFCvtF32ToI64, 0>;
256+
def : ReadAdvance<ReadFCvtF64ToI32, 0>;
257+
def : ReadAdvance<ReadFCvtF64ToI64, 0>;
258+
def : ReadAdvance<ReadFCvtI32ToF32, 0>;
259+
def : ReadAdvance<ReadFCvtI32ToF64, 0>;
260+
def : ReadAdvance<ReadFCvtI64ToF32, 0>;
261+
def : ReadAdvance<ReadFCvtI64ToF64, 0>;
262+
def : ReadAdvance<ReadFCvtF32ToF64, 0>;
263+
def : ReadAdvance<ReadFCvtF64ToF32, 0>;
264+
def : ReadAdvance<ReadFMovF32ToI32, 0>;
265+
def : ReadAdvance<ReadFMovI32ToF32, 0>;
266+
def : ReadAdvance<ReadFMovF64ToI64, 0>;
267+
def : ReadAdvance<ReadFMovI64ToF64, 0>;
268+
def : ReadAdvance<ReadFClass32, 0>;
269+
def : ReadAdvance<ReadFClass64, 0>;
270+
271+
// Zb* operand reads. Reads of ops that execute on the ALU get the 1-cycle
// load-to-ALU bypass; reads of ops that execute on the MDU get no advance.
272+
// Zba
273+
def : XS2LoadToALUBypass<ReadSHXADD>;
274+
def : XS2LoadToALUBypass<ReadSHXADD32>;
275+
// Zbb
276+
def : XS2LoadToALUBypass<ReadRotateImm>;
277+
def : XS2LoadToALUBypass<ReadRotateImm32>;
278+
def : XS2LoadToALUBypass<ReadRotateReg>;
279+
def : XS2LoadToALUBypass<ReadRotateReg32>;
280+
def : ReadAdvance<ReadCLZ, 0>;
281+
def : ReadAdvance<ReadCLZ32, 0>;
282+
def : ReadAdvance<ReadCTZ, 0>;
283+
def : ReadAdvance<ReadCTZ32, 0>;
284+
def : ReadAdvance<ReadCPOP, 0>;
285+
def : ReadAdvance<ReadCPOP32, 0>;
286+
def : XS2LoadToALUBypass<ReadORCB>;
287+
def : XS2LoadToALUBypass<ReadREV8>;
288+
// Zbkc
289+
def : ReadAdvance<ReadCLMUL, 0>;
290+
// Zbs
291+
def : XS2LoadToALUBypass<ReadSingleBit>;
292+
def : XS2LoadToALUBypass<ReadSingleBitImm>;
293+
// Zbkb
294+
def : XS2LoadToALUBypass<ReadBREV8>;
295+
def : XS2LoadToALUBypass<ReadPACK>;
296+
def : XS2LoadToALUBypass<ReadPACK32>;
297+
def : XS2LoadToALUBypass<ReadZIP>;
298+
// Zbkx
299+
def : ReadAdvance<ReadXPERM, 0>;
300+
301+
//===----------------------------------------------------------------------===//
302+
// Unsupported extensions: the scheduling classes of these extensions
// (V, Zfa, Zfh, SFB, Zabha) are declared unsupported for this model.
303+
defm : UnsupportedSchedV;
304+
defm : UnsupportedSchedZfa;
305+
defm : UnsupportedSchedZfh;
306+
defm : UnsupportedSchedSFB;
307+
defm : UnsupportedSchedZabha;
308+
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2+
# RUN: llvm-mca -mtriple=riscv64 -mcpu=xiangshan-nanhu < %s | FileCheck %s
3+
4+
# Test XiangShan FuDian's cascade FMA: each fmadd.s feeds the next one's addend (fa0), so the dependent chain should sustain CPI = 3.
5+
fmadd.s fa0, fa1, fa2, fa0
6+
7+
# CHECK: Iterations: 100
8+
# CHECK-NEXT: Instructions: 100
9+
# CHECK-NEXT: Total Cycles: 305
10+
# CHECK-NEXT: Total uOps: 100
11+
12+
# CHECK: Dispatch Width: 6
13+
# CHECK-NEXT: uOps Per Cycle: 0.33
14+
# CHECK-NEXT: IPC: 0.33
15+
# CHECK-NEXT: Block RThroughput: 0.3
16+
17+
# CHECK: Instruction Info:
18+
# CHECK-NEXT: [1]: #uOps
19+
# CHECK-NEXT: [2]: Latency
20+
# CHECK-NEXT: [3]: RThroughput
21+
# CHECK-NEXT: [4]: MayLoad
22+
# CHECK-NEXT: [5]: MayStore
23+
# CHECK-NEXT: [6]: HasSideEffects (U)
24+
25+
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
26+
# CHECK-NEXT: 1 5 0.25 fmadd.s fa0, fa1, fa2, fa0
27+
28+
# CHECK: Resources:
29+
# CHECK-NEXT: [0.0] - XS2ALU
30+
# CHECK-NEXT: [0.1] - XS2ALU
31+
# CHECK-NEXT: [0.2] - XS2ALU
32+
# CHECK-NEXT: [0.3] - XS2ALU
33+
# CHECK-NEXT: [1.0] - XS2FMAC
34+
# CHECK-NEXT: [1.1] - XS2FMAC
35+
# CHECK-NEXT: [1.2] - XS2FMAC
36+
# CHECK-NEXT: [1.3] - XS2FMAC
37+
# CHECK-NEXT: [2.0] - XS2FMISC
38+
# CHECK-NEXT: [2.1] - XS2FMISC
39+
# CHECK-NEXT: [3.0] - XS2LD
40+
# CHECK-NEXT: [3.1] - XS2LD
41+
# CHECK-NEXT: [4.0] - XS2MDU
42+
# CHECK-NEXT: [4.1] - XS2MDU
43+
# CHECK-NEXT: [5] - XS2MISC
44+
# CHECK-NEXT: [6.0] - XS2ST
45+
# CHECK-NEXT: [6.1] - XS2ST
46+
47+
# CHECK: Resource pressure per iteration:
48+
# CHECK-NEXT: [0.0] [0.1] [0.2] [0.3] [1.0] [1.1] [1.2] [1.3] [2.0] [2.1] [3.0] [3.1] [4.0] [4.1] [5] [6.0] [6.1]
49+
# CHECK-NEXT: - - - - 0.25 0.25 0.25 0.25 - - - - - - - - -
50+
51+
# CHECK: Resource pressure by instruction:
52+
# CHECK-NEXT: [0.0] [0.1] [0.2] [0.3] [1.0] [1.1] [1.2] [1.3] [2.0] [2.1] [3.0] [3.1] [4.0] [4.1] [5] [6.0] [6.1] Instructions:
53+
# CHECK-NEXT: - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - fmadd.s fa0, fa1, fa2, fa0

0 commit comments

Comments
 (0)