AMDGPU: Respect amdgpu-no-agpr in functions and with calls (#128147)

arsenm · web-flow · commit ccad5e77442f · 2025-02-23T09:00:37.000+07:00
Remove the MIR scan to detect whether AGPRs are used or not,
and the special case for callable functions. This behavior was
confusing, and not overridable. The amdgpu-no-agpr attribute was
intended to avoid this imprecise heuristic for how many AGPRs to
allocate. It was also too confusing to make this interact with
the pending amdgpu-num-agpr replacement for amdgpu-no-agpr.

Also adds an xfail-ish test where the register allocator asserts
after allocation fails which I ran into.

Future work should reintroduce a more refined MIR scan to estimate
AGPR pressure for how to split AGPRs and VGPRs.
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -64,6 +64,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
   }
 
   MayNeedAGPRs = ST.hasMAIInsts();
+  if (ST.hasGFX90AInsts() &&
+      ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
+      !mayUseAGPRs(F))
+    MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
 
   if (AMDGPU::isChainCC(CC)) {
     // Chain functions don't receive an SP from their caller, but are free to
@@ -98,13 +102,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
       ImplicitArgPtr = true;
   } else {
     ImplicitArgPtr = false;
-    MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(),
-                               MaxKernArgAlign);
-
-    if (ST.hasGFX90AInsts() &&
-        ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
-        !mayUseAGPRs(F))
-      MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
+    MaxKernArgAlign =
+        std::max(ST.getAlignmentForImplicitArgPtr(), MaxKernArgAlign);
   }
 
   if (!AMDGPU::isGraphics(CC) ||
@@ -783,44 +782,3 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
 bool SIMachineFunctionInfo::mayUseAGPRs(const Function &F) const {
   return !F.hasFnAttribute("amdgpu-no-agpr");
 }
-
-bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const {
-  if (UsesAGPRs)
-    return *UsesAGPRs;
-
-  if (!mayNeedAGPRs()) {
-    UsesAGPRs = false;
-    return false;
-  }
-
-  if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) ||
-      MF.getFrameInfo().hasCalls()) {
-    UsesAGPRs = true;
-    return true;
-  }
-
-  const MachineRegisterInfo &MRI = MF.getRegInfo();
-
-  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
-    const Register Reg = Register::index2VirtReg(I);
-    const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
-    if (RC && SIRegisterInfo::isAGPRClass(RC)) {
-      UsesAGPRs = true;
-      return true;
-    }
-    if (!RC && !MRI.use_empty(Reg) && MRI.getType(Reg).isValid()) {
-      // Defer caching UsesAGPRs, function might not yet been regbank selected.
-      return true;
-    }
-  }
-
-  for (MCRegister Reg : AMDGPU::AGPR_32RegClass) {
-    if (MRI.isPhysRegUsed(Reg)) {
-      UsesAGPRs = true;
-      return true;
-    }
-  }
-
-  UsesAGPRs = false;
-  return false;
-}
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -494,8 +494,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   // scheduler stage.
   unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
 
-  mutable std::optional<bool> UsesAGPRs;
-
   MCPhysReg getNextUserSGPR() const;
 
   MCPhysReg getNextSystemSGPR() const;
@@ -1126,9 +1124,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   // has a call which may use it.
   bool mayUseAGPRs(const Function &F) const;
 
-  // \returns true if a function needs or may need AGPRs.
-  bool usesAGPRs(const MachineFunction &MF) const;
-
   /// \returns Default/requested number of work groups for this function.
   SmallVector<unsigned> getMaxNumWorkGroups() const { return MaxNumWorkGroups; }
 
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -585,7 +585,7 @@ SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const {
   // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
   //       register file accordingly.
   if (ST.hasGFX90AInsts()) {
-    if (MFI->usesAGPRs(MF)) {
+    if (MFI->mayNeedAGPRs()) {
       MaxNumVGPRs /= 2;
       MaxNumAGPRs = MaxNumVGPRs;
     } else {
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers-assertion-after-ra-failure.xfail.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers-assertion-after-ra-failure.xfail.ll
@@ -0,0 +1,23 @@
+; REQUIRES: asserts
+; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %s 2>&1 | FileCheck -check-prefix=CRASH %s
+
+; CRASH: error: <unknown>:0:0: no registers from class available to allocate in function 'no_free_vgprs_at_agpr_to_agpr_copy'
+; CRASH: Cannot access invalid iterator
+
+define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
+  %asm = call { <32 x i32>, <16 x float> } asm sideeffect "; def $0 $1", "=${v[0:31]},=${a[0:15]}"()
+  %vgpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 0
+  %agpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 1
+  %mfma = call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %v0, float %v1, <16 x float> %agpr0, i32 0, i32 0, i32 0)
+  %agpr1 = call i32 asm sideeffect "; copy ", "={a1},a,~{a[0:15]}"(<16 x float> %agpr0)
+  %agpr2 = call i32 asm sideeffect "; copy ", "={a2},a,{a[0:15]}"(i32 %agpr1, <16 x float> %mfma)
+  call void asm sideeffect "; use $0 $1", "{a3},{v[0:31]}"(i32 %agpr2, <32 x i32> %vgpr0)
+  ret void
+}
+
+declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #1
+declare noundef i32 @llvm.amdgcn.workitem.id.x() #2
+
+attributes #0 = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="6,6" }
+attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -240,7 +240,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
 }
 
 ; Check that we do make use of v32 if there are no AGPRs present in the function
-define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #0 {
+define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #5 {
 ; GFX908-LABEL: no_agpr_no_reserve:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -1144,5 +1144,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() #2
 attributes #0 = { "amdgpu-waves-per-eu"="6,6" }
 attributes #1 = { convergent nounwind readnone willreturn }
 attributes #2 = { nounwind readnone willreturn }
-attributes #3 = { "amdgpu-waves-per-eu"="7,7" }
+attributes #3 = { "amdgpu-waves-per-eu"="7,7" "amdgpu-no-agpr" }
 attributes #4 = { "amdgpu-waves-per-eu"="6,6" "amdgpu-flat-work-group-size"="1024,1024" }
+attributes #5 = { "amdgpu-waves-per-eu"="6,6" "amdgpu-no-agpr" }
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
@@ -94,9 +94,20 @@ bb3:
   ret void
 }
 
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_nonentry:
+; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_nonentry_noagpr:
+; GFX908: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
+; GFX90A: v_mfma_f32_32x32x1{{.*}} v[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9:]+}}]
+define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0 {
+bb:
+  %in.1 = load <32 x float>, ptr addrspace(1) %arg
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_nonentry_with_agpr:
 ; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
-define void @test_mfma_f32_32x32x1f32_nonentry(ptr addrspace(1) %arg) #0 {
+define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg) #3 {
 bb:
   %in.1 = load <32 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -109,3 +120,4 @@ declare void @foo()
 attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" "amdgpu-no-agpr" }
 attributes #1 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" }
 attributes #2 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" }
+attributes #3 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" }
diff --git a/llvm/test/CodeGen/AMDGPU/spill-regpressure-less.mir b/llvm/test/CodeGen/AMDGPU/spill-regpressure-less.mir
@@ -6,7 +6,7 @@
     ret void
   }
 
-  attributes #0 = { "amdgpu-waves-per-eu"="8,8" }
+  attributes #0 = { "amdgpu-waves-per-eu"="8,8" "amdgpu-no-agpr" }
 ...
 
 ---
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll

Original file line number	Diff line number	Diff line change
`@@ -6,7 +6,7 @@`
`6`	`6`	`ret void`
`7`	`7`	`}`
`8`	`8`
`9`		`- attributes #0 = { "amdgpu-waves-per-eu"="8,8" }`
	`9`	`+ attributes #0 = { "amdgpu-waves-per-eu"="8,8" "amdgpu-no-agpr" }`
`10`	`10`	`...`
`11`	`11`
`12`	`12`	`---`