Skip to content

Commit 06b51c2

Browse files
committed
[WIP][AMDGPU][Attributor] Infer inreg attribute in AMDGPUAttributor
1 parent 0ba006d commit 06b51c2

File tree

4 files changed

+202
-4
lines changed

4 files changed

+202
-4
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Lines changed: 104 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "GCNSubtarget.h"
1515
#include "Utils/AMDGPUBaseInfo.h"
1616
#include "llvm/Analysis/CycleAnalysis.h"
17+
#include "llvm/Analysis/UniformityAnalysis.h"
1718
#include "llvm/CodeGen/TargetPassConfig.h"
1819
#include "llvm/IR/IntrinsicsAMDGPU.h"
1920
#include "llvm/IR/IntrinsicsR600.h"
@@ -1014,6 +1015,102 @@ struct AAAMDGPUNoAGPR
10141015

10151016
const char AAAMDGPUNoAGPR::ID = 0;
10161017

1018+
struct AAAMDGPUInreg
1019+
: public IRAttribute<Attribute::InReg,
1020+
StateWrapper<BooleanState, AbstractAttribute>,
1021+
AAAMDGPUInreg> {
1022+
AAAMDGPUInreg(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {}
1023+
1024+
/// Create an abstract attribute view for the position \p IRP.
1025+
static AAAMDGPUInreg &createForPosition(const IRPosition &IRP, Attributor &A);
1026+
1027+
/// See AbstractAttribute::getName()
1028+
const std::string getName() const override { return "AAAMDGPUInreg"; }
1029+
1030+
const std::string getAsStr(Attributor *A) const override {
1031+
return getAssumed() ? "inreg" : "non-inreg";
1032+
}
1033+
1034+
void trackStatistics() const override {}
1035+
1036+
/// See AbstractAttribute::getIdAddr()
1037+
const char *getIdAddr() const override { return &ID; }
1038+
1039+
/// This function should return true if the type of the \p AA is AAAMDGPUInreg
1040+
static bool classof(const AbstractAttribute *AA) {
1041+
return (AA->getIdAddr() == &ID);
1042+
}
1043+
1044+
/// Unique ID (due to the unique address)
1045+
static const char ID;
1046+
};
1047+
1048+
const char AAAMDGPUInreg::ID = 0;
1049+
1050+
namespace {
1051+
1052+
struct AAAMDGPUInregArgument : public AAAMDGPUInreg {
1053+
AAAMDGPUInregArgument(const IRPosition &IRP, Attributor &A)
1054+
: AAAMDGPUInreg(IRP, A) {}
1055+
1056+
void initialize(Attributor &A) override {
1057+
assert(getAssociatedFunction()->getCallingConv() !=
1058+
CallingConv::AMDGPU_KERNEL);
1059+
if (getAssociatedArgument()->hasAttribute(Attribute::InReg))
1060+
indicateOptimisticFixpoint();
1061+
}
1062+
1063+
ChangeStatus updateImpl(Attributor &A) override {
1064+
unsigned ArgNo = getAssociatedArgument()->getArgNo();
1065+
1066+
auto Pred = [&](AbstractCallSite ACS) -> bool {
1067+
CallBase *CB = ACS.getInstruction();
1068+
Value *V = CB->getArgOperandUse(ArgNo);
1069+
if (isa<Constant>(V))
1070+
return true;
1071+
if (auto *I = dyn_cast<Instruction>(V)) {
1072+
auto AU = A.getInfoCache()
1073+
.getAnalysisResultForFunction<UniformityInfoAnalysis>(
1074+
*I->getFunction());
1075+
return AU && AU->isUniform(I);
1076+
}
1077+
if (auto *Arg = dyn_cast<Argument>(V)) {
1078+
// We assume all kernel arguments are uniform.
1079+
if (Arg->getParent()->getCallingConv() == CallingConv::AMDGPU_KERNEL)
1080+
return true;
1081+
auto *AA =
1082+
A.getOrCreateAAFor<AAAMDGPUInreg>(IRPosition::argument(*Arg));
1083+
return AA && AA->isValidState();
1084+
}
1085+
// For unforeseen cases, we need to assume it is not uniform thus not
1086+
// qualified for inreg.
1087+
return false;
1088+
};
1089+
1090+
bool UsedAssumedInformation = false;
1091+
if (!A.checkForAllCallSites(Pred, *this, /*RequireAllCallSites=*/true,
1092+
UsedAssumedInformation))
1093+
return indicatePessimisticFixpoint();
1094+
1095+
if (!UsedAssumedInformation)
1096+
return indicateOptimisticFixpoint();
1097+
1098+
return ChangeStatus::UNCHANGED;
1099+
}
1100+
};
1101+
1102+
} // namespace
1103+
1104+
/// Factory for AAAMDGPUInreg. Only argument positions are supported; any
/// other position kind is a caller error.
AAAMDGPUInreg &AAAMDGPUInreg::createForPosition(const IRPosition &IRP,
                                                Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_ARGUMENT)
    return *new (A.Allocator) AAAMDGPUInregArgument(IRP, A);
  llvm_unreachable("not a valid position for AAAMDGPUInreg");
}
1113+
10171114
static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
10181115
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
10191116
for (unsigned I = 0;
@@ -1046,7 +1143,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
10461143
&AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID,
10471144
&AAPointerInfo::ID, &AAPotentialConstantValues::ID,
10481145
&AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
1049-
&AAInstanceInfo::ID});
1146+
&AAInstanceInfo::ID, &AAAMDGPUInreg::ID});
10501147

10511148
AttributorConfig AC(CGUpdater);
10521149
AC.IsClosedWorldModule = Options.IsClosedWorld;
@@ -1090,6 +1187,11 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
10901187
IRPosition::value(*SI->getPointerOperand()));
10911188
}
10921189
}
1190+
1191+
if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL) {
1192+
for (auto &Arg : F.args())
1193+
A.getOrCreateAAFor<AAAMDGPUInreg>(IRPosition::argument(Arg));
1194+
}
10931195
}
10941196

10951197
ChangeStatus Change = A.run();
@@ -1118,6 +1220,7 @@ class AMDGPUAttributorLegacy : public ModulePass {
11181220

11191221
void getAnalysisUsage(AnalysisUsage &AU) const override {
11201222
AU.addRequired<CycleInfoWrapperPass>();
1223+
AU.addRequired<UniformityInfoWrapperPass>();
11211224
}
11221225

11231226
StringRef getPassName() const override { return "AMDGPU Attributor"; }

llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
define internal fastcc void @foo(ptr %kg) {
1010
; CHECK-LABEL: define internal fastcc void @foo(
11-
; CHECK-SAME: ptr [[KG:%.*]]) #[[ATTR0:[0-9]+]] {
11+
; CHECK-SAME: ptr inreg [[KG:%.*]]) #[[ATTR0:[0-9]+]] {
1212
; CHECK-NEXT: [[ENTRY:.*:]]
1313
; CHECK-NEXT: [[CLOSURE_I25_I:%.*]] = getelementptr i8, ptr [[KG]], i64 336
1414
; CHECK-NEXT: [[NUM_CLOSURE_I26_I:%.*]] = getelementptr i8, ptr [[KG]], i64 276
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
2+
; RUN: opt -S -mtriple=amdgcn-unknown-unknown -passes=amdgpu-attributor %s -o - | FileCheck %s
3+
4+
@g1 = protected addrspace(1) externally_initialized global i32 0, align 4
5+
@g2 = protected addrspace(1) externally_initialized global i32 0, align 4
6+
@g3 = protected addrspace(1) externally_initialized global i32 0, align 4
7+
@g4 = protected addrspace(1) externally_initialized global i32 0, align 4
8+
9+
;.
10+
; CHECK: @g1 = protected addrspace(1) externally_initialized global i32 0, align 4
11+
; CHECK: @g2 = protected addrspace(1) externally_initialized global i32 0, align 4
12+
; CHECK: @g3 = protected addrspace(1) externally_initialized global i32 0, align 4
13+
; CHECK: @g4 = protected addrspace(1) externally_initialized global i32 0, align 4
14+
;.
15+
define internal fastcc void @callee_infer(ptr addrspace(1) %x, i32 %y) {
16+
; CHECK-LABEL: define {{[^@]+}}@callee_infer
17+
; CHECK-SAME: (ptr addrspace(1) inreg [[X:%.*]], i32 inreg [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
18+
; CHECK-NEXT: entry:
19+
; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4
20+
; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g3, align 4
21+
; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g4, align 4
22+
; CHECK-NEXT: ret void
23+
;
24+
entry:
25+
%x.val = load i32, ptr addrspace(1) %x, align 4
26+
store i32 %x.val, ptr addrspace(1) @g3, align 4
27+
store i32 %y, ptr addrspace(1) @g4, align 4
28+
ret void
29+
}
30+
31+
define protected amdgpu_kernel void @kernel_infer(ptr addrspace(1) %p1, ptr addrspace(1) %p2, i32 %x) {
32+
; CHECK-LABEL: define {{[^@]+}}@kernel_infer
33+
; CHECK-SAME: (ptr addrspace(1) [[P1:%.*]], ptr addrspace(1) [[P2:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
34+
; CHECK-NEXT: entry:
35+
; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X]], 0
36+
; CHECK-NEXT: [[P:%.*]] = select i1 [[CMP]], ptr addrspace(1) [[P1]], ptr addrspace(1) [[P2]]
37+
; CHECK-NEXT: tail call fastcc void @callee_infer(ptr addrspace(1) @g1, i32 [[X]])
38+
; CHECK-NEXT: tail call fastcc void @callee_infer(ptr addrspace(1) @g2, i32 [[X]])
39+
; CHECK-NEXT: tail call fastcc void @callee_infer(ptr addrspace(1) @g1, i32 1)
40+
; CHECK-NEXT: tail call fastcc void @callee_infer(ptr addrspace(1) @g2, i32 2)
41+
; CHECK-NEXT: tail call fastcc void @callee_infer(ptr addrspace(1) [[P]], i32 [[X]])
42+
; CHECK-NEXT: ret void
43+
;
44+
entry:
45+
%cmp = icmp sgt i32 %x, 0
46+
%p = select i1 %cmp, ptr addrspace(1) %p1, ptr addrspace(1) %p2
47+
tail call fastcc void @callee_infer(ptr addrspace(1) @g1, i32 %x)
48+
tail call fastcc void @callee_infer(ptr addrspace(1) @g2, i32 %x)
49+
tail call fastcc void @callee_infer(ptr addrspace(1) @g1, i32 1)
50+
tail call fastcc void @callee_infer(ptr addrspace(1) @g2, i32 2)
51+
tail call fastcc void @callee_infer(ptr addrspace(1) %p, i32 %x)
52+
ret void
53+
}
54+
55+
define internal fastcc void @callee_not_infer(ptr addrspace(1) %x, i32 %y) {
56+
; CHECK-LABEL: define {{[^@]+}}@callee_not_infer
57+
; CHECK-SAME: (ptr addrspace(1) [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] {
58+
; CHECK-NEXT: entry:
59+
; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4
60+
; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g3, align 4
61+
; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g4, align 4
62+
; CHECK-NEXT: ret void
63+
;
64+
entry:
65+
%x.val = load i32, ptr addrspace(1) %x, align 4
66+
store i32 %x.val, ptr addrspace(1) @g3, align 4
67+
store i32 %y, ptr addrspace(1) @g4, align 4
68+
ret void
69+
}
70+
71+
define protected amdgpu_kernel void @kernel_not_infer(ptr %q, ptr addrspace(1) %p1, ptr addrspace(1) %p2) {
72+
; CHECK-LABEL: define {{[^@]+}}@kernel_not_infer
73+
; CHECK-SAME: (ptr [[Q:%.*]], ptr addrspace(1) [[P1:%.*]], ptr addrspace(1) [[P2:%.*]]) #[[ATTR0]] {
74+
; CHECK-NEXT: entry:
75+
; CHECK-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
76+
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[Q]], i32 [[ID_X]]
77+
; CHECK-NEXT: [[D:%.*]] = load i32, ptr [[GEP]], align 4
78+
; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[D]], 0
79+
; CHECK-NEXT: [[P:%.*]] = select i1 [[CMP]], ptr addrspace(1) [[P1]], ptr addrspace(1) [[P2]]
80+
; CHECK-NEXT: tail call fastcc void @callee_not_infer(ptr addrspace(1) [[P]], i32 [[ID_X]])
81+
; CHECK-NEXT: ret void
82+
;
83+
entry:
84+
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
85+
%gep = getelementptr i32, ptr %q, i32 %id.x
86+
%d = load i32, ptr %gep
87+
%cmp = icmp sgt i32 %d, 0
88+
%p = select i1 %cmp, ptr addrspace(1) %p1, ptr addrspace(1) %p2
89+
tail call fastcc void @callee_not_infer(ptr addrspace(1) %p, i32 %id.x)
90+
ret void
91+
}
92+
;.
93+
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
94+
; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
95+
;.

llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ define amdgpu_kernel void @kernel_lds() {
148148

149149
define internal i16 @mutual_recursion_0(i16 %arg) {
150150
; CHECK-LABEL: define internal i16 @mutual_recursion_0(
151-
; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR0]] {
151+
; CHECK-SAME: i16 inreg [[ARG:%.*]]) #[[ATTR0]] {
152152
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
153153
; CHECK-NEXT: [[RECURSIVE_KERNEL_LDS:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
154154
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[RECURSIVE_KERNEL_LDS]], align 4
@@ -168,7 +168,7 @@ define internal i16 @mutual_recursion_0(i16 %arg) {
168168

169169
define internal void @mutual_recursion_1(i16 %arg) {
170170
; CHECK-LABEL: define internal void @mutual_recursion_1(
171-
; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR0]] {
171+
; CHECK-SAME: i16 inreg [[ARG:%.*]]) #[[ATTR0]] {
172172
; CHECK-NEXT: call void @mutual_recursion_0(i16 [[ARG]])
173173
; CHECK-NEXT: ret void
174174
;

0 commit comments

Comments
 (0)