Skip to content

Commit ccad5e7

Browse files
authored
AMDGPU: Respect amdgpu-no-agpr in functions and with calls (#128147)
Remove the MIR scan that detects whether AGPRs are used, along with the special case for callable functions. This behavior was confusing and could not be overridden. The amdgpu-no-agpr attribute was intended to avoid this imprecise heuristic for deciding how many AGPRs to allocate. It was also too confusing to make this interact with the pending amdgpu-num-agpr replacement for amdgpu-no-agpr. Also add an xfail-ish test for a case I ran into, where the register allocator asserts after allocation fails. Future work should reintroduce a more refined MIR scan that estimates AGPR pressure to decide how to split registers between AGPRs and VGPRs.
1 parent 6e61126 commit ccad5e7

8 files changed

+655
-64
lines changed

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 6 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
6464
}
6565

6666
MayNeedAGPRs = ST.hasMAIInsts();
67+
if (ST.hasGFX90AInsts() &&
68+
ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
69+
!mayUseAGPRs(F))
70+
MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
6771

6872
if (AMDGPU::isChainCC(CC)) {
6973
// Chain functions don't receive an SP from their caller, but are free to
@@ -98,13 +102,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
98102
ImplicitArgPtr = true;
99103
} else {
100104
ImplicitArgPtr = false;
101-
MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(),
102-
MaxKernArgAlign);
103-
104-
if (ST.hasGFX90AInsts() &&
105-
ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
106-
!mayUseAGPRs(F))
107-
MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
105+
MaxKernArgAlign =
106+
std::max(ST.getAlignmentForImplicitArgPtr(), MaxKernArgAlign);
108107
}
109108

110109
if (!AMDGPU::isGraphics(CC) ||
@@ -783,44 +782,3 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
783782
bool SIMachineFunctionInfo::mayUseAGPRs(const Function &F) const {
784783
return !F.hasFnAttribute("amdgpu-no-agpr");
785784
}
786-
787-
bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const {
788-
if (UsesAGPRs)
789-
return *UsesAGPRs;
790-
791-
if (!mayNeedAGPRs()) {
792-
UsesAGPRs = false;
793-
return false;
794-
}
795-
796-
if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) ||
797-
MF.getFrameInfo().hasCalls()) {
798-
UsesAGPRs = true;
799-
return true;
800-
}
801-
802-
const MachineRegisterInfo &MRI = MF.getRegInfo();
803-
804-
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
805-
const Register Reg = Register::index2VirtReg(I);
806-
const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
807-
if (RC && SIRegisterInfo::isAGPRClass(RC)) {
808-
UsesAGPRs = true;
809-
return true;
810-
}
811-
if (!RC && !MRI.use_empty(Reg) && MRI.getType(Reg).isValid()) {
812-
// Defer caching UsesAGPRs; the function might not yet have been regbank selected.
813-
return true;
814-
}
815-
}
816-
817-
for (MCRegister Reg : AMDGPU::AGPR_32RegClass) {
818-
if (MRI.isPhysRegUsed(Reg)) {
819-
UsesAGPRs = true;
820-
return true;
821-
}
822-
}
823-
824-
UsesAGPRs = false;
825-
return false;
826-
}

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -494,8 +494,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
494494
// scheduler stage.
495495
unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
496496

497-
mutable std::optional<bool> UsesAGPRs;
498-
499497
MCPhysReg getNextUserSGPR() const;
500498

501499
MCPhysReg getNextSystemSGPR() const;
@@ -1126,9 +1124,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
11261124
// has a call which may use it.
11271125
bool mayUseAGPRs(const Function &F) const;
11281126

1129-
// \returns true if a function needs or may need AGPRs.
1130-
bool usesAGPRs(const MachineFunction &MF) const;
1131-
11321127
/// \returns Default/requested number of work groups for this function.
11331128
SmallVector<unsigned> getMaxNumWorkGroups() const { return MaxNumWorkGroups; }
11341129

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -585,7 +585,7 @@ SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const {
585585
// TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
586586
// register file accordingly.
587587
if (ST.hasGFX90AInsts()) {
588-
if (MFI->usesAGPRs(MF)) {
588+
if (MFI->mayNeedAGPRs()) {
589589
MaxNumVGPRs /= 2;
590590
MaxNumAGPRs = MaxNumVGPRs;
591591
} else {
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
; REQUIRES: asserts
2+
; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %s 2>&1 | FileCheck -check-prefix=CRASH %s
3+
4+
; CRASH: error: <unknown>:0:0: no registers from class available to allocate in function 'no_free_vgprs_at_agpr_to_agpr_copy'
5+
; CRASH: Cannot access invalid iterator
6+
7+
define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
8+
%asm = call { <32 x i32>, <16 x float> } asm sideeffect "; def $0 $1", "=${v[0:31]},=${a[0:15]}"()
9+
%vgpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 0
10+
%agpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 1
11+
%mfma = call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %v0, float %v1, <16 x float> %agpr0, i32 0, i32 0, i32 0)
12+
%agpr1 = call i32 asm sideeffect "; copy ", "={a1},a,~{a[0:15]}"(<16 x float> %agpr0)
13+
%agpr2 = call i32 asm sideeffect "; copy ", "={a2},a,{a[0:15]}"(i32 %agpr1, <16 x float> %mfma)
14+
call void asm sideeffect "; use $0 $1", "{a3},{v[0:31]}"(i32 %agpr2, <32 x i32> %vgpr0)
15+
ret void
16+
}
17+
18+
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #1
19+
declare noundef i32 @llvm.amdgcn.workitem.id.x() #2
20+
21+
attributes #0 = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="6,6" }
22+
attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
23+
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
240240
}
241241

242242
; Check that we do make use of v32 if there are no AGPRs present in the function
243-
define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #0 {
243+
define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #5 {
244244
; GFX908-LABEL: no_agpr_no_reserve:
245245
; GFX908: ; %bb.0:
246246
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -1144,5 +1144,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() #2
11441144
attributes #0 = { "amdgpu-waves-per-eu"="6,6" }
11451145
attributes #1 = { convergent nounwind readnone willreturn }
11461146
attributes #2 = { nounwind readnone willreturn }
1147-
attributes #3 = { "amdgpu-waves-per-eu"="7,7" }
1147+
attributes #3 = { "amdgpu-waves-per-eu"="7,7" "amdgpu-no-agpr" }
11481148
attributes #4 = { "amdgpu-waves-per-eu"="6,6" "amdgpu-flat-work-group-size"="1024,1024" }
1149+
attributes #5 = { "amdgpu-waves-per-eu"="6,6" "amdgpu-no-agpr" }

llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,9 +94,20 @@ bb3:
9494
ret void
9595
}
9696

97-
; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_nonentry:
97+
; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_nonentry_noagpr:
98+
; GFX908: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
99+
; GFX90A: v_mfma_f32_32x32x1{{.*}} v[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9:]+}}]
100+
define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0 {
101+
bb:
102+
%in.1 = load <32 x float>, ptr addrspace(1) %arg
103+
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
104+
store <32 x float> %mai.1, ptr addrspace(1) %arg
105+
ret void
106+
}
107+
108+
; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_nonentry_with_agpr:
98109
; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
99-
define void @test_mfma_f32_32x32x1f32_nonentry(ptr addrspace(1) %arg) #0 {
110+
define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg) #3 {
100111
bb:
101112
%in.1 = load <32 x float>, ptr addrspace(1) %arg
102113
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -109,3 +120,4 @@ declare void @foo()
109120
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" "amdgpu-no-agpr" }
110121
attributes #1 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" }
111122
attributes #2 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" }
123+
attributes #3 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" }

llvm/test/CodeGen/AMDGPU/spill-regpressure-less.mir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
ret void
77
}
88

9-
attributes #0 = { "amdgpu-waves-per-eu"="8,8" }
9+
attributes #0 = { "amdgpu-waves-per-eu"="8,8" "amdgpu-no-agpr" }
1010
...
1111

1212
---

0 commit comments

Comments
 (0)