Improve register estimation for CodeScheduling

admitric · igcbot · commit ea015d44a9e1 · 2025-09-17T15:08:54.000+02:00
- Fix the incorrect estimation of the initial register pressure
- Support more special cases for code with various casts in the
CodeScheduling's RegisterPressureTracker
diff --git a/IGC/Compiler/CISACodeGen/CodeScheduling.cpp b/IGC/Compiler/CISACodeGen/CodeScheduling.cpp
@@ -459,7 +459,9 @@ class RegisterPressureTracker {
     if (!I)
       return V;
 
-    if (isNoOpInst(I, CTX)) {
+    bool IsAddrSpaceCast = isa<AddrSpaceCastInst>(I);
+
+    if (isNoOpInst(I, CTX) || IsAddrSpaceCast) {
       return getRealOp(I->getOperand(0));
     }
     return V;
@@ -568,7 +570,10 @@ class RegisterPressureTracker {
   }
 
   int32_t estimateOrUpdateImpl(Instruction *I, bool Update) {
-    if (IGCLLVM::isDebugOrPseudoInst(*I) || I->isLifetimeStartOrEnd() || isNoOpInst(I, CTX)) {
+    auto *Intr = dyn_cast<GenIntrinsicInst>(I);
+    bool IsNoOpIntr = Intr && (Intr->getIntrinsicID() == GenISAIntrinsic::GenISA_ptr_to_pair);
+
+    if (IGCLLVM::isDebugOrPseudoInst(*I) || I->isLifetimeStartOrEnd() || isNoOpInst(I, CTX) || IsNoOpIntr) {
       // NoOp instructions do not change register pressure
       if (Update)
         PrintDumpLevel(VerbosityLevel::High, "NoOp instruction: " << getName(I) << "\n");
@@ -993,8 +998,20 @@ class BBScheduler {
 
     int32_t MaxOriginalRegpressure = 0;
     bool OriginalScheduleCanHaveSpills = false;
+
+    PrintDump("Original schedule: " << BBName << "\n");
     for (auto &I : *BB) {
-      RPT.update(&I);
+      std::string Info;
+      if (isa<PHINode>(&I)) {
+        // PHIs are already included in the initial regpressure
+        Info = formatDebugInfo(RPT.getCurrentPressure(), 0, "Phi", getVectorShuffleString(&I, VSA, RCA));
+      } else {
+        int32_t Estimate = RPT.update(&I);
+        Info = formatDebugInfo(RPT.getCurrentPressure(), Estimate, "OG", getVectorShuffleString(&I, VSA, RCA));
+      }
+      PrintDump(Info);
+      PrintInstructionDump(&I);
+
       MaxOriginalRegpressure = std::max(MaxOriginalRegpressure, RPT.getCurrentPressure());
       if (RPT.isRegpressureCritical()) {
         OriginalScheduleCanHaveSpills = true;
@@ -1202,6 +1219,45 @@ class BBScheduler {
   SchedulingConfig &C;
   llvm::raw_ostream *LogStream;
 
+  // Builds the "(<pressure>, <estimate>) <Type>: " dump prefix, space-padded to fixed columns, then AddString.
+  // Fields wider than their column are kept whole instead of being cut off, so the printed numbers stay exact.
+  static std::string formatDebugInfo(int32_t CurrentPressure, int32_t Estimate, const std::string &Type,
+                                     const std::string &AddString = "") {
+    constexpr size_t ESTIMATION_NUMBERS_WIDTH = 12;
+    constexpr size_t INFO_WIDTH = 20;
+    std::string Info = std::to_string(CurrentPressure) + ", " + std::to_string(Estimate);
+    if (Info.size() < ESTIMATION_NUMBERS_WIDTH)
+      Info.resize(ESTIMATION_NUMBERS_WIDTH, ' '); // pad only; an unconditional resize() drops trailing digits
+    Info = "(" + Info + ") " + Type + ": ";
+    if (Info.size() < INFO_WIDTH)
+      Info.resize(INFO_WIDTH, ' '); // pad only; keeps long Type labels and the ": " separator intact
+    Info += AddString; // appending an empty string is a no-op, so no emptiness check is needed
+
+    return Info;
+  }
+
+  // Returns the 4-character dump tag for I: "REM " if RCA reports a remat chain pattern, "NOP "/"VS  "/"SCA "
+  // for a no-op / vector-shuffle / other dest vector from VSA, "V2S " for a vector-to-scalars pattern, else blanks.
+  static std::string getVectorShuffleString(Instruction *I, VectorShuffleAnalysis *VSA, RematChainsAnalysis *RCA) {
+    auto *DestVec = VSA->getDestVector(I);
+    auto *VecToScalars = VSA->getVectorToScalarsPattern(I);
+    auto *RematChain = RCA->getRematChainPattern(I);
+
+    // The first matching category wins: remat chain, then the dest vector kinds, then vector-to-scalars.
+    if (RematChain)
+      return "REM ";
+
+    if (DestVec) {
+      if (DestVec->isNoOp())
+        return "NOP ";
+      return DestVec->isVectorShuffle() ? "VS  " : "SCA ";
+    }
+
+    if (VecToScalars)
+      return "V2S ";
+    return "    ";
+  }
+
   class InstructionNode {
   public:
     InstructionNode(Instruction *I, uint32_t N) : I(I), OriginalPosition(N) {
@@ -1222,8 +1278,9 @@ class BBScheduler {
 
     void print(llvm::raw_ostream &LogStream) {
       if (IGC_IS_FLAG_ENABLED(DumpCodeScheduling)) {
-        std::string Info = "Node #" + std::to_string(OriginalPosition) + ", MW: " + std::to_string(MaxWeight) + " ";
-        Info.resize(23, ' ');
+        const int INFO_WIDTH = 16;
+        std::string Info = "#" + std::to_string(OriginalPosition) + ", MW: " + std::to_string(MaxWeight) + " ";
+        Info.resize(INFO_WIDTH, ' ');
         LogStream << Info;
         I->print(LogStream);
         LogStream << "\n";
@@ -2335,31 +2392,8 @@ class BBScheduler {
           }
         }
 
-        std::string Info = std::to_string(RT.getCurrentPressure()) + ", " + std::to_string(RT.estimate(Node->I));
-        Info.resize(11, ' ');
-        Info = "(" + Info + ") Im: ";
-        Info.resize(20, ' ');
-
-        auto *V2SP = VSA->getVectorToScalarsPattern(Node->I);
-        auto *RCP = RCA->getRematChainPattern(Node->I);
-
-        if (RCP) {
-          VS_String = "REM";
-        }
-        if (DT && DT->isVectorShuffle()) {
-          VS_String = "VS ";
-        }
-        if (DT && !DT->isVectorShuffle()) {
-          VS_String = "SCA";
-        }
-        if (DT && DT->isNoOp()) {
-          VS_String = "NOP";
-        }
-        if (V2SP) {
-          VS_String = "V2S";
-        }
-
-        Info += VS_String + "   ";
+        std::string Info = formatDebugInfo(
+          RT.getCurrentPressure(), RT.estimate(Node->I), "Im", getVectorShuffleString(Node->I, VSA, RCA));
 
         PrintDump(Info);
         Node->print(*LogStream);
@@ -2476,27 +2510,11 @@ class BBScheduler {
           AllInstructionsScheduledByRP = false;
         }
 
-        // Dump the info
-        std::string Info = std::to_string(RT.getCurrentPressure()) + ", " + std::to_string(RT.estimate(Node->I));
-        Info.resize(11, ' ');
-        Info = "(" + Info + ") " + (ChooseByRP ? "RP" : "MW") + ": ";
-        Info.resize(20, ' ');
-
-        auto *DT = VSA->getDestVector(Node->I);
-
-        std::string VS_String = "   ";
-        if (DT && DT->isVectorShuffle()) {
-          VS_String = "VS ";
-        }
-        if (DT && !DT->isVectorShuffle()) {
-          VS_String = "SCA";
-        }
-        if (DT && DT->isNoOp()) {
-          VS_String = "NOP";
-        }
-
-        Info += VS_String + (CanClone ? " * " : "   ");
-
+        std::string ChoosingMode = ChooseByRP ? "RP" : "MW";
+        ChoosingMode += CanClone ? "*" : "";
+        std::string Info = formatDebugInfo(RT.getCurrentPressure(), RT.estimate(Node->I),
+                           ChoosingMode,
+                           getVectorShuffleString(Node->I, VSA, RCA));
         PrintDump(Info);
         Node->print(*LogStream);
 
diff --git a/IGC/Compiler/tests/CodeScheduling/reg-est-ptrtopair.ll b/IGC/Compiler/tests/CodeScheduling/reg-est-ptrtopair.ll
@@ -0,0 +1,107 @@
+;=========================== begin_copyright_notice ============================
+;
+; Copyright (C) 2025 Intel Corporation
+;
+; SPDX-License-Identifier: MIT
+;
+;============================ end_copyright_notice =============================
+
+; REQUIRES: regkeys
+; RUN: igc_opt --opaque-pointers --regkey DisableCodeScheduling=0 --regkey EnableCodeSchedulingIfNoSpills=1 \
+; RUN:         --regkey PrintToConsole=1 --regkey DumpCodeScheduling=1 --igc-code-scheduling \
+; RUN:         --regkey CodeSchedulingRPThreshold=-512 \
+; RUN:         --regkey ForceOCLSIMDWidth=16 -S %s 2>&1 | FileCheck %s
+
+
+; Checks that the register pressure is estimated correctly for the special cases related to
+; addrspace cast and pointer/int casts
+
+define void @test_lsc2dblockread(i64 %base_addr, i64 %offset, i32 %shift_val) {
+; CHECK: Function test_lsc2dblockread
+; CHECK: Original schedule: entry
+
+entry:
+; Calculate address components
+; The first instruction adds a new value (the argument doesn't die)
+
+; CHECK: {{\([0-9]+,[ ]*64[ ]*\) OG:[ A-Z]*}} [[ADDR_SHIFT:%.*]] = shl i32 [[SHIFT_VAL:%.*]], 6
+  %addr_shift = shl i32 %shift_val, 6
+
+; The second instruction also increases the estimated register pressure
+; because it extends the value to a larger data type
+
+; CHECK: {{\([0-9]+,[ ]*64[ ]*\) OG:[ A-Z]*}} [[ADDR_SHIFT_EXT:%.*]] = zext i32 [[ADDR_SHIFT]] to i64
+  %addr_shift_ext = zext i32 %addr_shift to i64
+; CHECK: {{\([0-9]+,[ ]*0[ ]*\) OG:[ A-Z]*}} [[BASE_PLUS_SHIFT:%.*]] = add i64 [[BASE_ADDR:%.*]], [[ADDR_SHIFT_EXT]]
+  %base_plus_shift = add i64 %base_addr, %addr_shift_ext
+
+; No changes as we continue calculating the address and perform various casts
+
+; CHECK: {{\([0-9]+,[ ]*0[ ]*\) OG:[ A-Z]*}} [[FINAL_ADDR:%.*]] = add i64 [[BASE_PLUS_SHIFT]], [[OFFSET:%.*]]
+  %final_addr = add i64 %base_plus_shift, %offset
+
+; Convert to pointer and back through address spaces
+; CHECK: {{\([0-9]+,[ ]*0[ ]*\) OG:[ A-Z]*}} [[PTR_AS1:%.*]] = inttoptr i64 [[FINAL_ADDR]] to ptr addrspace(1)
+  %ptr_as1 = inttoptr i64 %final_addr to i8 addrspace(1)*
+; CHECK: {{\([0-9]+,[ ]*0[ ]*\) OG:[ A-Z]*}} [[PTR_AS4:%.*]] = addrspacecast ptr addrspace(1) [[PTR_AS1]] to ptr addrspace(4)
+  %ptr_as4 = addrspacecast i8 addrspace(1)* %ptr_as1 to i8 addrspace(4)*
+; CHECK: {{\([0-9]+,[ ]*0[ ]*\) OG:[ A-Z]*}} [[ADDR_AS4:%.*]] = ptrtoint ptr addrspace(4) [[PTR_AS4]] to i64
+  %addr_as4 = ptrtoint i8 addrspace(4)* %ptr_as4 to i64
+
+; Convert to pointer pair
+; CHECK: {{\([0-9]+,[ ]*0[ ]*\) OG:[ A-Z]*}} [[PTR_PAIR:%.*]] = call { i32, i32 } @llvm.genx.GenISA.ptr.to.pair.p4i8(ptr addrspace(4) [[PTR_AS4]])
+  %ptr_pair = call { i32, i32 } @llvm.genx.GenISA.ptr.to.pair.p4i8(i8 addrspace(4)* %ptr_as4)
+
+; Currently estimated as +64 followed by -64: the net effect is correct, but
+; it may not be ideal for the heuristics that choose the next instruction
+; CHECK: {{\([0-9]+,[ ]*64[ ]*\) OG:[ A-Z]*}} [[PAIR_LOW:%.*]] = extractvalue { i32, i32 } [[PTR_PAIR]], 0
+  %pair_low = extractvalue { i32, i32 } %ptr_pair, 0
+; CHECK: {{\([0-9]+,[ ]*-64[ ]*\) OG:[ A-Z]*}} [[PAIR_HIGH:%.*]] = extractvalue { i32, i32 } [[PTR_PAIR]], 1
+  %pair_high = extractvalue { i32, i32 } %ptr_pair, 1
+
+; Prepare the first parameter (base address as i64)
+; CHECK: {{\([0-9]+,[ ]*0[ ]*\) OG:[ A-Z]*}} [[ALIGNED_LOW:%.*]] = and i32 [[PAIR_LOW]], -64
+  %aligned_low = and i32 %pair_low, -64
+
+; Estimated as +128 -128
+; CHECK: {{\([0-9]+,[ ]*128[ ]*\) OG:[ A-Z]*}} [[VEC_LOW:%.*]] = insertelement <2 x i32> undef, i32 [[ALIGNED_LOW]], i32 0
+  %vec_low = insertelement <2 x i32> undef, i32 %aligned_low, i32 0
+; CHECK: {{\([0-9]+,[ ]*-128[ ]*\) OG:[ A-Z]*}} [[VEC_PAIR:%.*]] = insertelement <2 x i32> [[VEC_LOW]], i32 [[PAIR_HIGH]], i32 1
+  %vec_pair = insertelement <2 x i32> %vec_low, i32 %pair_high, i32 1
+
+; Bitcast is a no-op
+; CHECK: {{\([0-9]+,[ ]*0[ ]*\) OG:[ A-Z]*}} [[BASE_ADDR_PARAM:%.*]] = bitcast <2 x i32> [[VEC_PAIR]] to i64
+  %base_addr_param = bitcast <2 x i32> %vec_pair to i64
+
+; Prepare the coordinate parameter
+; Truncation decreases the register pressure as the return type becomes smaller
+; CHECK: {{\([0-9]+,[ ]*-64[ ]*\) OG:[ A-Z]*}} [[ADDR_TRUNC:%.*]] = trunc i64 [[ADDR_AS4]] to i32
+  %addr_trunc = trunc i64 %addr_as4 to i32
+; CHECK: {{\([0-9]+,[ ]*0[ ]*\) OG:[ A-Z]*}} [[COORD_SHIFT:%.*]] = lshr i32 [[ADDR_TRUNC]], 1
+  %coord_shift = lshr i32 %addr_trunc, 1
+; CHECK: {{\([0-9]+,[ ]*0[ ]*\) OG:[ A-Z]*}} [[COORD_MASKED:%.*]] = and i32 [[COORD_SHIFT]], 31
+  %coord_masked = and i32 %coord_shift, 31
+
+; Execute the LSC2DBlockRead: +256 (i16 * v8 * SIMD16) - 128 (i64 * SIMD16) - 64 (i32 * SIMD16) = +64
+; CHECK: {{\([0-9]+,[ ]*64[ ]*\) OG:[ A-Z]*}} [[LOAD_RESULT:%.*]] = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 [[BASE_ADDR_PARAM]], i32 4095, i32 7, i32 4095, i32 [[COORD_MASKED]], i32 0, i32 16, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0)
+  %load_result = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 %base_addr_param, i32 4095, i32 7, i32 4095, i32 %coord_masked, i32 0, i32 16, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0)
+
+; Execute the DPAS
+; The result type is twice as large as the load result, and the load result dies here
+; CHECK: {{\([0-9]+,[ ]*256[ ]*\) OG:[ A-Z]*}} [[DPAS_RESULT:%.*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> zeroinitializer, <8 x i16> [[LOAD_RESULT]], <8 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false)
+  %dpas_result = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> zeroinitializer, <8 x i16> %load_result, <8 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false)
+; CHECK: {{\([0-9]+,[ ]*0[ ]*\) OG:[ A-Z]*}} ret void
+  ret void
+}
+
+
+declare <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(
+  <8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) #1
+
+declare <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) #1
+declare { i32, i32 } @llvm.genx.GenISA.ptr.to.pair.p4i8(i8 addrspace(4)*) #3
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { convergent nounwind readnone willreturn }
+attributes #2 = { nofree nosync nounwind readnone speculatable willreturn }
+attributes #3 = { nounwind readnone willreturn }
diff --git a/IGC/Compiler/tests/CodeScheduling/reg-est-vector-cases-simd16.ll b/IGC/Compiler/tests/CodeScheduling/reg-est-vector-cases-simd16.ll
@@ -113,6 +113,8 @@ entry:
 
 define spir_kernel void @vector_shuffle(ptr addrspace(1) %A) {
 ; CHECK: Function vector_shuffle
+; CHECK: Greedy MW attempt
+
 ; CHECK: {{([0-9]+,[ ]*[0-9]+[ ]*).*[ ]*}}        [[BASE_ADDR:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64
 
 ;  (6, 512     ) MW:         Node #1, MW: 3000        %load2d = call <16 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v16i16(i64 %base_addr, i32 127, i32 1023, i32 127, i32 0, i32 0, i32 16, i32 16, i32 16, i32 2, i1 false, i1 false, i32 4)
@@ -207,6 +209,7 @@ entry:
 
 define spir_kernel void @coalesced_scalars(ptr addrspace(1) %0) {
 ; CHECK: Function coalesced_scalars
+; CHECK: Greedy MW attempt
 
 ;               the IE instructions are marked as SCA. First IE adds regpressure
 ;               then the last usage of the scalar (fadd) kills the hanging values
@@ -303,6 +306,7 @@ define spir_kernel void @coalesced_scalars(ptr addrspace(1) %0) {
 
 define spir_kernel void @vector_to_scalars_pattern(ptr addrspace(1) %A) {
 ; CHECK: Function vector_to_scalars_pattern
+; CHECK: Greedy MW attempt
 
 ;           DPAS increases regpressure
 ; CHECK: {{([0-9]+,[ ]*512[ ]*).*[ ]*}}           [[DPAS:%.*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> zeroinitializer, <8 x i16> undef, <8 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false)
diff --git a/IGC/Compiler/tests/CodeScheduling/reg-est-vector-cases-simd32.ll b/IGC/Compiler/tests/CodeScheduling/reg-est-vector-cases-simd32.ll
@@ -19,6 +19,7 @@
 define spir_kernel void @vector_shuffle_no_op(ptr addrspace(1) %A) {
 ; CHECK: Function vector_shuffle_no_op
 ; CHECK: Greedy MW attempt
+
 ; CHECK: {{([0-9]+,[ ]*[0-9]+[ ]*).*[ ]*}}        [[BASE_ADDR:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64
 
 ;  (6, 512     ) MW:         Node #1, MW: 3000        %load2d = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 %base_addr, i32 127, i32 1023, i32 127, i32 0, i32 0, i32 16, i32 16, i32 16, i32 2, i1 false, i1 false, i32 4)
@@ -81,6 +82,8 @@ entry:
 
 define spir_kernel void @vector_shuffle(ptr addrspace(1) %A) {
 ; CHECK: Function vector_shuffle
+; CHECK: Greedy MW attempt
+
 ; CHECK: {{([0-9]+,[ ]*[0-9]+[ ]*).*[ ]*}}        [[BASE_ADDR:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64
 
 ;  (6, 512     ) MW:         Node #1, MW: 3000        %load2d = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 %base_addr, i32 127, i32 1023, i32 127, i32 0, i32 0, i32 16, i32 16, i32 16, i32 2, i1 false, i1 false, i32 4)
@@ -143,6 +146,7 @@ entry:
 
 define spir_kernel void @coalesced_scalars(ptr addrspace(1) %0) {
 ; CHECK: Function coalesced_scalars
+; CHECK: Greedy MW attempt
 
 ;               the IE instructions are marked as SCA. First IE adds regpressure
 ;               then the last usage of the scalar (fadd) kills the hanging values
@@ -207,6 +211,7 @@ define spir_kernel void @coalesced_scalars(ptr addrspace(1) %0) {
 
 define spir_kernel void @vector_to_scalars_pattern(ptr addrspace(1) %A) {
 ; CHECK: Function vector_to_scalars_pattern
+; CHECK: Greedy MW attempt
 
 ;           DPAS increases regpressure
 ; CHECK: {{([0-9]+,[ ]*512[ ]*).*[ ]*}}           [[DPAS:%.*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> undef, <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false)