Skip to content

Commit ea015d4

Browse files
admitricigcbot
authored and committed
Improve register estimation for CodeScheduling
- Fix the incorrect estimation of the initial register pressure
- Support more special cases for code with various casts in the CodeScheduling's RegisterPressureTracker
1 parent c700d6b commit ea015d4

File tree

4 files changed

+185
-51
lines changed

4 files changed

+185
-51
lines changed

IGC/Compiler/CISACodeGen/CodeScheduling.cpp

Lines changed: 69 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -459,7 +459,9 @@ class RegisterPressureTracker {
459459
if (!I)
460460
return V;
461461

462-
if (isNoOpInst(I, CTX)) {
462+
bool IsAddrSpaceCast = isa<AddrSpaceCastInst>(I);
463+
464+
if (isNoOpInst(I, CTX) || IsAddrSpaceCast) {
463465
return getRealOp(I->getOperand(0));
464466
}
465467
return V;
@@ -568,7 +570,10 @@ class RegisterPressureTracker {
568570
}
569571

570572
int32_t estimateOrUpdateImpl(Instruction *I, bool Update) {
571-
if (IGCLLVM::isDebugOrPseudoInst(*I) || I->isLifetimeStartOrEnd() || isNoOpInst(I, CTX)) {
573+
auto *Intr = dyn_cast<GenIntrinsicInst>(I);
574+
bool IsNoOpIntr = Intr && (Intr->getIntrinsicID() == GenISAIntrinsic::GenISA_ptr_to_pair);
575+
576+
if (IGCLLVM::isDebugOrPseudoInst(*I) || I->isLifetimeStartOrEnd() || isNoOpInst(I, CTX) || IsNoOpIntr) {
572577
// NoOp instructions do not change register pressure
573578
if (Update)
574579
PrintDumpLevel(VerbosityLevel::High, "NoOp instruction: " << getName(I) << "\n");
@@ -993,8 +998,20 @@ class BBScheduler {
993998

994999
int32_t MaxOriginalRegpressure = 0;
9951000
bool OriginalScheduleCanHaveSpills = false;
1001+
1002+
PrintDump("Original schedule: " << BBName << "\n");
9961003
for (auto &I : *BB) {
997-
RPT.update(&I);
1004+
std::string Info;
1005+
if (isa<PHINode>(&I)) {
1006+
// PHIs are already included in the initial regpressure
1007+
Info = formatDebugInfo(RPT.getCurrentPressure(), 0, "Phi", getVectorShuffleString(&I, VSA, RCA));
1008+
} else {
1009+
int32_t Estimate = RPT.update(&I);
1010+
Info = formatDebugInfo(RPT.getCurrentPressure(), Estimate, "OG", getVectorShuffleString(&I, VSA, RCA));
1011+
}
1012+
PrintDump(Info);
1013+
PrintInstructionDump(&I);
1014+
9981015
MaxOriginalRegpressure = std::max(MaxOriginalRegpressure, RPT.getCurrentPressure());
9991016
if (RPT.isRegpressureCritical()) {
10001017
OriginalScheduleCanHaveSpills = true;
@@ -1202,6 +1219,45 @@ class BBScheduler {
12021219
SchedulingConfig &C;
12031220
llvm::raw_ostream *LogStream;
12041221

1222+
// Helper function to format debug information string
1223+
static std::string formatDebugInfo(int32_t CurrentPressure, int32_t Estimate, const std::string Type,
1224+
const std::string AddString = "") {
1225+
const int ESTIMATION_NUMBERS_WIDTH = 12;
1226+
const int INFO_WIDTH = 20;
1227+
std::string Info = std::to_string(CurrentPressure) + ", " + std::to_string(Estimate);
1228+
Info.resize(ESTIMATION_NUMBERS_WIDTH, ' ');
1229+
Info = "(" + Info + ") " + Type + ": ";
1230+
Info.resize(INFO_WIDTH, ' ');
1231+
1232+
if (!AddString.empty()) {
1233+
Info += AddString;
1234+
}
1235+
1236+
return Info;
1237+
}
1238+
1239+
// Helper function to get vector shuffle string
1240+
static std::string getVectorShuffleString(Instruction *I, VectorShuffleAnalysis *VSA, RematChainsAnalysis *RCA) {
1241+
auto *DT = VSA->getDestVector(I);
1242+
auto *V2SP = VSA->getVectorToScalarsPattern(I);
1243+
auto *RCP = RCA->getRematChainPattern(I);
1244+
1245+
std::string VS_String = " ";
1246+
if (RCP) {
1247+
VS_String = "REM ";
1248+
} else if (DT && DT->isNoOp()) {
1249+
VS_String = "NOP ";
1250+
} else if (DT && DT->isVectorShuffle()) {
1251+
VS_String = "VS ";
1252+
} else if (DT && !DT->isVectorShuffle()) {
1253+
VS_String = "SCA ";
1254+
} else if (V2SP) {
1255+
VS_String = "V2S ";
1256+
}
1257+
1258+
return VS_String;
1259+
}
1260+
12051261
class InstructionNode {
12061262
public:
12071263
InstructionNode(Instruction *I, uint32_t N) : I(I), OriginalPosition(N) {
@@ -1222,8 +1278,9 @@ class BBScheduler {
12221278

12231279
void print(llvm::raw_ostream &LogStream) {
12241280
if (IGC_IS_FLAG_ENABLED(DumpCodeScheduling)) {
1225-
std::string Info = "Node #" + std::to_string(OriginalPosition) + ", MW: " + std::to_string(MaxWeight) + " ";
1226-
Info.resize(23, ' ');
1281+
const int INFO_WIDTH = 16;
1282+
std::string Info = "#" + std::to_string(OriginalPosition) + ", MW: " + std::to_string(MaxWeight) + " ";
1283+
Info.resize(INFO_WIDTH, ' ');
12271284
LogStream << Info;
12281285
I->print(LogStream);
12291286
LogStream << "\n";
@@ -2335,31 +2392,8 @@ class BBScheduler {
23352392
}
23362393
}
23372394

2338-
std::string Info = std::to_string(RT.getCurrentPressure()) + ", " + std::to_string(RT.estimate(Node->I));
2339-
Info.resize(11, ' ');
2340-
Info = "(" + Info + ") Im: ";
2341-
Info.resize(20, ' ');
2342-
2343-
auto *V2SP = VSA->getVectorToScalarsPattern(Node->I);
2344-
auto *RCP = RCA->getRematChainPattern(Node->I);
2345-
2346-
if (RCP) {
2347-
VS_String = "REM";
2348-
}
2349-
if (DT && DT->isVectorShuffle()) {
2350-
VS_String = "VS ";
2351-
}
2352-
if (DT && !DT->isVectorShuffle()) {
2353-
VS_String = "SCA";
2354-
}
2355-
if (DT && DT->isNoOp()) {
2356-
VS_String = "NOP";
2357-
}
2358-
if (V2SP) {
2359-
VS_String = "V2S";
2360-
}
2361-
2362-
Info += VS_String + " ";
2395+
std::string Info = formatDebugInfo(
2396+
RT.getCurrentPressure(), RT.estimate(Node->I), "Im", getVectorShuffleString(Node->I, VSA, RCA));
23632397

23642398
PrintDump(Info);
23652399
Node->print(*LogStream);
@@ -2476,27 +2510,11 @@ class BBScheduler {
24762510
AllInstructionsScheduledByRP = false;
24772511
}
24782512

2479-
// Dump the info
2480-
std::string Info = std::to_string(RT.getCurrentPressure()) + ", " + std::to_string(RT.estimate(Node->I));
2481-
Info.resize(11, ' ');
2482-
Info = "(" + Info + ") " + (ChooseByRP ? "RP" : "MW") + ": ";
2483-
Info.resize(20, ' ');
2484-
2485-
auto *DT = VSA->getDestVector(Node->I);
2486-
2487-
std::string VS_String = " ";
2488-
if (DT && DT->isVectorShuffle()) {
2489-
VS_String = "VS ";
2490-
}
2491-
if (DT && !DT->isVectorShuffle()) {
2492-
VS_String = "SCA";
2493-
}
2494-
if (DT && DT->isNoOp()) {
2495-
VS_String = "NOP";
2496-
}
2497-
2498-
Info += VS_String + (CanClone ? " * " : " ");
2499-
2513+
std::string ChoosingMode = ChooseByRP ? "RP" : "MW";
2514+
ChoosingMode += CanClone ? "*" : "";
2515+
std::string Info = formatDebugInfo(RT.getCurrentPressure(), RT.estimate(Node->I),
2516+
ChoosingMode,
2517+
getVectorShuffleString(Node->I, VSA, RCA));
25002518
PrintDump(Info);
25012519
Node->print(*LogStream);
25022520

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2025 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
9+
; REQUIRES: regkeys
10+
; RUN: igc_opt --opaque-pointers --regkey DisableCodeScheduling=0 --regkey EnableCodeSchedulingIfNoSpills=1 \
11+
; RUN: --regkey PrintToConsole=1 --regkey DumpCodeScheduling=1 --igc-code-scheduling \
12+
; RUN: --regkey CodeSchedulingRPThreshold=-512 \
13+
; RUN: --regkey ForceOCLSIMDWidth=16 -S %s 2>&1 | FileCheck %s
14+
15+
16+
; Checks that the register pressure is estimated correctly for the special cases related to
17+
; addrspace cast and pointer/int casts
18+
19+
define void @test_lsc2dblockread(i64 %base_addr, i64 %offset, i32 %shift_val) {
20+
; CHECK: Function test_lsc2dblockread
21+
; CHECK: Original schedule: entry
22+
23+
entry:
24+
; Calculate address components
25+
; The first instruction adds a new value (the argument doesn't die)
26+
27+
; CHECK: {{\([0-9]+,[ ]*64[ ]*\) OG:[ A-Z]*}} [[ADDR_SHIFT:%.*]] = shl i32 [[SHIFT_VAL:%.*]], 6
28+
%addr_shift = shl i32 %shift_val, 6
29+
30+
; The second instruction should also be estimated as a register pressure increase
31+
; Because we extend to a larger data type
32+
33+
; CHECK: {{\([0-9]+,[ ]*64[ ]*\) OG:[ A-Z]*}} [[ADDR_SHIFT_EXT:%.*]] = zext i32 [[ADDR_SHIFT]] to i64
34+
%addr_shift_ext = zext i32 %addr_shift to i64
35+
; CHECK: {{\([0-9]+,[ ]*0[ ]*\) OG:[ A-Z]*}} [[BASE_PLUS_SHIFT:%.*]] = add i64 [[BASE_ADDR:%.*]], [[ADDR_SHIFT_EXT]]
36+
%base_plus_shift = add i64 %base_addr, %addr_shift_ext
37+
38+
; No changes as we continue calculating the address and perform various casts
39+
40+
; CHECK: {{\([0-9]+,[ ]*0[ ]*\) OG:[ A-Z]*}} [[FINAL_ADDR:%.*]] = add i64 [[BASE_PLUS_SHIFT]], [[OFFSET:%.*]]
41+
%final_addr = add i64 %base_plus_shift, %offset
42+
43+
; Convert to pointer and back through address spaces
44+
; CHECK: {{\([0-9]+,[ ]*0[ ]*\) OG:[ A-Z]*}} [[PTR_AS1:%.*]] = inttoptr i64 [[FINAL_ADDR]] to ptr addrspace(1)
45+
%ptr_as1 = inttoptr i64 %final_addr to i8 addrspace(1)*
46+
; CHECK: {{\([0-9]+,[ ]*0[ ]*\) OG:[ A-Z]*}} [[PTR_AS4:%.*]] = addrspacecast ptr addrspace(1) [[PTR_AS1]] to ptr addrspace(4)
47+
%ptr_as4 = addrspacecast i8 addrspace(1)* %ptr_as1 to i8 addrspace(4)*
48+
; CHECK: {{\([0-9]+,[ ]*0[ ]*\) OG:[ A-Z]*}} [[ADDR_AS4:%.*]] = ptrtoint ptr addrspace(4) [[PTR_AS4]] to i64
49+
%addr_as4 = ptrtoint i8 addrspace(4)* %ptr_as4 to i64
50+
51+
; Convert to pointer pair
52+
; CHECK: {{\([0-9]+,[ ]*0[ ]*\) OG:[ A-Z]*}} [[PTR_PAIR:%.*]] = call { i32, i32 } @llvm.genx.GenISA.ptr.to.pair.p4i8(ptr addrspace(4) [[PTR_AS4]])
53+
%ptr_pair = call { i32, i32 } @llvm.genx.GenISA.ptr.to.pair.p4i8(i8 addrspace(4)* %ptr_as4)
54+
55+
; Currently it's estimated as +64 -64: essentially correct, but
56+
; may not be perfect for the instruction-choosing heuristics
57+
; CHECK: {{\([0-9]+,[ ]*64[ ]*\) OG:[ A-Z]*}} [[PAIR_LOW:%.*]] = extractvalue { i32, i32 } [[PTR_PAIR]], 0
58+
%pair_low = extractvalue { i32, i32 } %ptr_pair, 0
59+
; CHECK: {{\([0-9]+,[ ]*-64[ ]*\) OG:[ A-Z]*}} [[PAIR_HIGH:%.*]] = extractvalue { i32, i32 } [[PTR_PAIR]], 1
60+
%pair_high = extractvalue { i32, i32 } %ptr_pair, 1
61+
62+
; Prepare the first parameter (base address as i64)
63+
; CHECK: {{\([0-9]+,[ ]*0[ ]*\) OG:[ A-Z]*}} [[ALIGNED_LOW:%.*]] = and i32 [[PAIR_LOW]], -64
64+
%aligned_low = and i32 %pair_low, -64
65+
66+
; Estimated as +128 -128
67+
; CHECK: {{\([0-9]+,[ ]*128[ ]*\) OG:[ A-Z]*}} [[VEC_LOW:%.*]] = insertelement <2 x i32> undef, i32 [[ALIGNED_LOW]], i32 0
68+
%vec_low = insertelement <2 x i32> undef, i32 %aligned_low, i32 0
69+
; CHECK: {{\([0-9]+,[ ]*-128[ ]*\) OG:[ A-Z]*}} [[VEC_PAIR:%.*]] = insertelement <2 x i32> [[VEC_LOW]], i32 [[PAIR_HIGH]], i32 1
70+
%vec_pair = insertelement <2 x i32> %vec_low, i32 %pair_high, i32 1
71+
72+
; Bitcast is a no-op
73+
; CHECK: {{\([0-9]+,[ ]*0[ ]*\) OG:[ A-Z]*}} [[BASE_ADDR_PARAM:%.*]] = bitcast <2 x i32> [[VEC_PAIR]] to i64
74+
%base_addr_param = bitcast <2 x i32> %vec_pair to i64
75+
76+
; Prepare the coordinate parameter
77+
; Truncation decreases the register pressure as the return type becomes smaller
78+
; CHECK: {{\([0-9]+,[ ]*-64[ ]*\) OG:[ A-Z]*}} [[ADDR_TRUNC:%.*]] = trunc i64 [[ADDR_AS4]] to i32
79+
%addr_trunc = trunc i64 %addr_as4 to i32
80+
; CHECK: {{\([0-9]+,[ ]*0[ ]*\) OG:[ A-Z]*}} [[COORD_SHIFT:%.*]] = lshr i32 [[ADDR_TRUNC]], 1
81+
%coord_shift = lshr i32 %addr_trunc, 1
82+
; CHECK: {{\([0-9]+,[ ]*0[ ]*\) OG:[ A-Z]*}} [[COORD_MASKED:%.*]] = and i32 [[COORD_SHIFT]], 31
83+
%coord_masked = and i32 %coord_shift, 31
84+
85+
; Execute the LSC2DBlockRead: 256 (i16 * v8 * SIMD16) - 128 (i64 x SIMD16) - 64 (i32 * SIMD16)
86+
; CHECK: {{\([0-9]+,[ ]*64[ ]*\) OG:[ A-Z]*}} [[LOAD_RESULT:%.*]] = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 [[BASE_ADDR_PARAM]], i32 4095, i32 7, i32 4095, i32 [[COORD_MASKED]], i32 0, i32 16, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0)
87+
%load_result = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 %base_addr_param, i32 4095, i32 7, i32 4095, i32 %coord_masked, i32 0, i32 16, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0)
88+
89+
; Execute the DPAS
90+
; Returns a type twice as large as the load result; the load result dies here
91+
; CHECK: {{\([0-9]+,[ ]*256[ ]*\) OG:[ A-Z]*}} [[DPAS_RESULT:%.*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> zeroinitializer, <8 x i16> [[LOAD_RESULT]], <8 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false)
92+
%dpas_result = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> zeroinitializer, <8 x i16> %load_result, <8 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false)
93+
; CHECK: {{\([0-9]+,[ ]*0[ ]*\) OG:[ A-Z]*}} ret void
94+
ret void
95+
}
96+
97+
98+
declare <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(
99+
<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) #1
100+
101+
declare <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) #1
102+
declare { i32, i32 } @llvm.genx.GenISA.ptr.to.pair.p4i8(i8 addrspace(4)*) #3
103+
104+
attributes #0 = { convergent nounwind }
105+
attributes #1 = { convergent nounwind readnone willreturn }
106+
attributes #2 = { nofree nosync nounwind readnone speculatable willreturn }
107+
attributes #3 = { nounwind readnone willreturn }

IGC/Compiler/tests/CodeScheduling/reg-est-vector-cases-simd16.ll

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ entry:
113113

114114
define spir_kernel void @vector_shuffle(ptr addrspace(1) %A) {
115115
; CHECK: Function vector_shuffle
116+
; CHECK: Greedy MW attempt
117+
116118
; CHECK: {{([0-9]+,[ ]*[0-9]+[ ]*).*[ ]*}} [[BASE_ADDR:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64
117119

118120
; (6, 512 ) MW: Node #1, MW: 3000 %load2d = call <16 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v16i16(i64 %base_addr, i32 127, i32 1023, i32 127, i32 0, i32 0, i32 16, i32 16, i32 16, i32 2, i1 false, i1 false, i32 4)
@@ -207,6 +209,7 @@ entry:
207209

208210
define spir_kernel void @coalesced_scalars(ptr addrspace(1) %0) {
209211
; CHECK: Function coalesced_scalars
212+
; CHECK: Greedy MW attempt
210213

211214
; the IE instructions are marked as SCA. First IE adds regpressure
212215
; then the last usage of the scalar (fadd) kills the hanging values
@@ -303,6 +306,7 @@ define spir_kernel void @coalesced_scalars(ptr addrspace(1) %0) {
303306

304307
define spir_kernel void @vector_to_scalars_pattern(ptr addrspace(1) %A) {
305308
; CHECK: Function vector_to_scalars_pattern
309+
; CHECK: Greedy MW attempt
306310

307311
; DPAS increases regpressure
308312
; CHECK: {{([0-9]+,[ ]*512[ ]*).*[ ]*}} [[DPAS:%.*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> zeroinitializer, <8 x i16> undef, <8 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false)

IGC/Compiler/tests/CodeScheduling/reg-est-vector-cases-simd32.ll

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
define spir_kernel void @vector_shuffle_no_op(ptr addrspace(1) %A) {
2020
; CHECK: Function vector_shuffle_no_op
2121
; CHECK: Greedy MW attempt
22+
2223
; CHECK: {{([0-9]+,[ ]*[0-9]+[ ]*).*[ ]*}} [[BASE_ADDR:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64
2324

2425
; (6, 512 ) MW: Node #1, MW: 3000 %load2d = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 %base_addr, i32 127, i32 1023, i32 127, i32 0, i32 0, i32 16, i32 16, i32 16, i32 2, i1 false, i1 false, i32 4)
@@ -81,6 +82,8 @@ entry:
8182

8283
define spir_kernel void @vector_shuffle(ptr addrspace(1) %A) {
8384
; CHECK: Function vector_shuffle
85+
; CHECK: Greedy MW attempt
86+
8487
; CHECK: {{([0-9]+,[ ]*[0-9]+[ ]*).*[ ]*}} [[BASE_ADDR:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64
8588

8689
; (6, 512 ) MW: Node #1, MW: 3000 %load2d = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 %base_addr, i32 127, i32 1023, i32 127, i32 0, i32 0, i32 16, i32 16, i32 16, i32 2, i1 false, i1 false, i32 4)
@@ -143,6 +146,7 @@ entry:
143146

144147
define spir_kernel void @coalesced_scalars(ptr addrspace(1) %0) {
145148
; CHECK: Function coalesced_scalars
149+
; CHECK: Greedy MW attempt
146150

147151
; the IE instructions are marked as SCA. First IE adds regpressure
148152
; then the last usage of the scalar (fadd) kills the hanging values
@@ -207,6 +211,7 @@ define spir_kernel void @coalesced_scalars(ptr addrspace(1) %0) {
207211

208212
define spir_kernel void @vector_to_scalars_pattern(ptr addrspace(1) %A) {
209213
; CHECK: Function vector_to_scalars_pattern
214+
; CHECK: Greedy MW attempt
210215

211216
; DPAS increases regpressure
212217
; CHECK: {{([0-9]+,[ ]*512[ ]*).*[ ]*}} [[DPAS:%.*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> undef, <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false)

0 commit comments

Comments
 (0)