Skip to content

Commit 1d543eb

Browse files
committed
[AMDGPU] Support preloading hidden kernel arguments
Adds hidden kernel arguments to the function signature and marks them inreg if they should be preloaded into user SGPRs. The normal kernarg preloading logic then takes over with some additional checks for the correct implicitarg_ptr alignment. Special care is needed so that metadata for the hidden arguments is not added twice when generating the code object.
1 parent 26ca8ef commit 1d543eb

10 files changed

+1101
-9
lines changed

llvm/docs/AMDGPUUsage.rst

+10
Original file line numberDiff line numberDiff line change
@@ -1639,6 +1639,10 @@ The AMDGPU backend supports the following LLVM IR attributes.
16391639
function which requires AGPRs is reached through any function marked
16401640
with this attribute.
16411641

1642+
"amdgpu-hidden-argument" This attribute is used internally by the backend to mark function arguments
1643+
as hidden. Hidden arguments are managed by the compiler and are not part of
1644+
the explicit arguments supplied by the user.
1645+
16421646
======================================= ==========================================================
16431647

16441648
Calling Conventions
@@ -5856,6 +5860,12 @@ may insert a trap instruction at the start of the kernel prologue to manage
58565860
situations where kernarg preloading is attempted on hardware with incompatible
58575861
firmware.
58585862

5863+
With code object V5 and later, hidden kernel arguments that are normally
5864+
accessed through the Implicit Argument Ptr may be preloaded into User SGPRs.
5865+
These arguments are added to the kernel function signature and are marked with
5866+
the attributes "inreg" and "amdgpu-hidden-argument". (See
5867+
:ref:`amdgpu-llvm-ir-attributes-table`).
5868+
58595869
.. _amdgpu-amdhsa-kernel-prolog:
58605870

58615871
Kernel Prolog

llvm/include/llvm/IR/Argument.h

+2
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,8 @@ class Argument final : public Value {
178178
/// Check if an argument has a given attribute.
179179
bool hasAttribute(Attribute::AttrKind Kind) const;
180180

181+
bool hasAttribute(StringRef Kind) const;
182+
181183
Attribute getAttribute(Attribute::AttrKind Kind) const;
182184

183185
/// Method for support type inquiry through isa, cast, and dyn_cast.

llvm/include/llvm/IR/Function.h

+3
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,9 @@ class LLVM_ABI Function : public GlobalObject, public ilist_node<Function> {
433433
/// Check if an attribute is in the list of attributes.
434434
bool hasParamAttribute(unsigned ArgNo, Attribute::AttrKind Kind) const;
435435

436+
/// Check if an attribute is in the list of attributes.
437+
bool hasParamAttribute(unsigned ArgNo, StringRef Kind) const;
438+
436439
/// gets the attribute from the list of attributes.
437440
Attribute getAttributeAtIndex(unsigned i, Attribute::AttrKind Kind) const;
438441

llvm/lib/IR/Function.cpp

+8
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,10 @@ bool Argument::hasAttribute(Attribute::AttrKind Kind) const {
351351
return getParent()->hasParamAttribute(getArgNo(), Kind);
352352
}
353353

354+
/// Check whether this argument carries the string attribute named \p Kind.
/// The query is forwarded to the parent function, keyed by this argument's
/// index in the parameter list.
bool Argument::hasAttribute(StringRef Kind) const {
  const Function *Parent = getParent();
  return Parent->hasParamAttribute(getArgNo(), Kind);
}
357+
354358
Attribute Argument::getAttribute(Attribute::AttrKind Kind) const {
355359
return getParent()->getParamAttribute(getArgNo(), Kind);
356360
}
@@ -738,6 +742,10 @@ bool Function::hasParamAttribute(unsigned ArgNo,
738742
return AttributeSets.hasParamAttr(ArgNo, Kind);
739743
}
740744

745+
bool Function::hasParamAttribute(unsigned ArgNo, StringRef Kind) const {
746+
return AttributeSets.hasParamAttr(ArgNo, Kind);
747+
}
748+
741749
Attribute Function::getAttributeAtIndex(unsigned i,
742750
Attribute::AttrKind Kind) const {
743751
return AttributeSets.getAttributeAtIndex(i, Kind);

llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp

+5-1
Original file line numberDiff line numberDiff line change
@@ -260,8 +260,12 @@ void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,
260260
auto &Func = MF.getFunction();
261261
unsigned Offset = 0;
262262
auto Args = HSAMetadataDoc->getArrayNode();
263-
for (auto &Arg : Func.args())
263+
for (auto &Arg : Func.args()) {
264+
if (Arg.hasAttribute("amdgpu-hidden-argument"))
265+
continue;
266+
264267
emitKernelArg(Arg, Offset, Args);
268+
}
265269

266270
emitHiddenKernelArgs(MF, Offset, Args);
267271

llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp

+200-2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313

1414
#include "AMDGPU.h"
1515
#include "GCNSubtarget.h"
16+
#include "llvm/ADT/StringExtras.h"
17+
#include "llvm/Analysis/ValueTracking.h"
1618
#include "llvm/CodeGen/TargetPassConfig.h"
1719
#include "llvm/IR/IRBuilder.h"
1820
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -31,9 +33,110 @@ class PreloadKernelArgInfo {
3133
const GCNSubtarget &ST;
3234
unsigned NumFreeUserSGPRs;
3335

34-
public:
35-
SmallVector<llvm::Metadata *, 8> KernelArgMetadata;
36+
// Identifies the hidden kernel arguments handled here. Enumerator values are
// used as indices into the HiddenArgs table; END_HIDDEN_ARGS is the entry
// count and doubles as the "no such argument" sentinel.
enum HiddenArg : unsigned {
  HIDDEN_BLOCK_COUNT_X = 0,
  HIDDEN_BLOCK_COUNT_Y,
  HIDDEN_BLOCK_COUNT_Z,
  HIDDEN_GROUP_SIZE_X,
  HIDDEN_GROUP_SIZE_Y,
  HIDDEN_GROUP_SIZE_Z,
  HIDDEN_REMAINDER_X,
  HIDDEN_REMAINDER_Y,
  HIDDEN_REMAINDER_Z,
  END_HIDDEN_ARGS // Sentinel: one past the last real hidden argument.
};
48+
49+
// Describes a single hidden argument: where it lives, how wide it is, and
// what it is called.
struct HiddenArgInfo {
  // Byte offset of the argument, measured from the location in the kernarg
  // segment that the implicitarg pointer points to.
  uint8_t Offset;
  // Width of the hidden argument in bytes.
  uint8_t Size;
  // Name used for the hidden argument in the kernel signature.
  const char *Name;
};
59+
60+
// Table of hidden arguments, indexed by HiddenArg. Each entry is
// {byte offset from the implicitarg pointer, size in bytes, signature name}.
static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
    {0, 4, "_hidden_block_count_x"},
    {4, 4, "_hidden_block_count_y"},
    {8, 4, "_hidden_block_count_z"},
    {12, 2, "_hidden_group_size_x"},
    {14, 2, "_hidden_group_size_y"},
    {16, 2, "_hidden_group_size_z"},
    {18, 2, "_hidden_remainder_x"},
    {20, 2, "_hidden_remainder_y"},
    {22, 2, "_hidden_remainder_z"},
};
66+
67+
// Maps a byte offset (relative to the implicitarg pointer) to the hidden
// argument that starts at exactly that offset. Returns END_HIDDEN_ARGS when no
// table entry begins there.
static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
  unsigned Idx = 0;
  while (Idx != END_HIDDEN_ARGS && HiddenArgs[Idx].Offset != Offset)
    ++Idx;

  // Idx equals END_HIDDEN_ARGS here if the scan found no matching entry.
  return static_cast<HiddenArg>(Idx);
}
74+
75+
// Returns the integer type whose bit width matches the byte size recorded for
// hidden argument HA. HA must be a real hidden argument, not the
// END_HIDDEN_ARGS sentinel.
static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
  if (HA >= END_HIDDEN_ARGS)
    llvm_unreachable("Unexpected hidden argument.");

  const unsigned BitWidth = HiddenArgs[HA].Size * 8;
  return Type::getIntNTy(Ctx, BitWidth);
}
81+
82+
// Returns the signature name recorded for hidden argument HA. HA must be a
// real hidden argument, not the END_HIDDEN_ARGS sentinel.
static const char *getHiddenArgName(HiddenArg HA) {
  if (HA >= END_HIDDEN_ARGS)
    llvm_unreachable("Unexpected hidden argument.");

  return HiddenArgs[HA].Name;
}
3688

89+
// Clones the function after adding implicit arguments to the argument list
90+
// and returns the new updated function. Preloaded implicit arguments are
91+
// added up to and including the last one that will be preloaded, indicated by
92+
// LastPreloadIndex. Currently preloading is only performed on the totality of
93+
// sequential data from the kernarg segment including implicit (hidden)
94+
// arguments. This means that all arguments up to the last preloaded argument
95+
// will also be preloaded even if that data is unused.
96+
Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
97+
FunctionType *FT = F.getFunctionType();
98+
LLVMContext &Ctx = F.getParent()->getContext();
99+
SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
100+
for (unsigned I = 0; I <= LastPreloadIndex; ++I)
101+
FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));
102+
103+
FunctionType *NFT =
104+
FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
105+
Function *NF =
106+
Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());
107+
108+
NF->copyAttributesFrom(&F);
109+
NF->copyMetadata(&F, 0);
110+
NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);
111+
112+
F.getParent()->getFunctionList().insert(F.getIterator(), NF);
113+
NF->takeName(&F);
114+
NF->splice(NF->begin(), &F);
115+
116+
Function::arg_iterator NFArg = NF->arg_begin();
117+
for (Argument &Arg : F.args()) {
118+
Arg.replaceAllUsesWith(&*NFArg);
119+
NFArg->takeName(&Arg);
120+
++NFArg;
121+
}
122+
123+
AttrBuilder AB(Ctx);
124+
AB.addAttribute(Attribute::InReg);
125+
AB.addAttribute("amdgpu-hidden-argument");
126+
AttributeList AL = NF->getAttributes();
127+
for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
128+
AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
129+
NFArg++->setName(getHiddenArgName(HiddenArg(I)));
130+
}
131+
132+
NF->setAttributes(AL);
133+
F.replaceAllUsesWith(NF);
134+
F.setCallingConv(CallingConv::C);
135+
136+
return NF;
137+
}
138+
139+
public:
37140
// Binds the kernel function and subtarget this helper works on, then
// initializes the free user SGPR count via setInitialFreeUserSGPRsCount().
PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST)
    : F(F),
      ST(ST) {
  setInitialFreeUserSGPRsCount();
}
@@ -64,6 +167,87 @@ class PreloadKernelArgInfo {
64167
NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
65168
return true;
66169
}
170+
171+
// Try to allocate SGPRs to preload implicit kernel arguments.
172+
void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
173+
IRBuilder<> &Builder) {
174+
StringRef Name = Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr);
175+
Function *ImplicitArgPtr = F.getParent()->getFunction(Name);
176+
if (!ImplicitArgPtr)
177+
return;
178+
179+
const DataLayout &DL = F.getParent()->getDataLayout();
180+
// Pair is the load and the load offset.
181+
SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
182+
for (auto *U : ImplicitArgPtr->users()) {
183+
Instruction *CI = dyn_cast<Instruction>(U);
184+
if (!CI || CI->getParent()->getParent() != &F)
185+
continue;
186+
187+
for (auto *U : CI->users()) {
188+
int64_t Offset = 0;
189+
auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
190+
if (!Load) {
191+
if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
192+
continue;
193+
194+
Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
195+
}
196+
197+
if (!Load || !Load->isSimple())
198+
continue;
199+
200+
// FIXME: Expand to handle 64-bit implicit args and large merged loads.
201+
LLVMContext &Ctx = F.getParent()->getContext();
202+
Type *LoadTy = Load->getType();
203+
HiddenArg HA = getHiddenArgFromOffset(Offset);
204+
if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
205+
continue;
206+
207+
ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
208+
}
209+
}
210+
211+
if (ImplicitArgLoads.empty())
212+
return;
213+
214+
// Allocate loads in order of offset. We need to be sure that the implicit
215+
// argument can actually be preloaded.
216+
std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());
217+
218+
uint64_t LastExplicitArgOffset = ImplicitArgsBaseOffset;
219+
// If we fail to preload any implicit argument we know we don't have SGPRs
220+
// to preload any subsequent ones with larger offsets. Find the first
221+
// argument that we cannot preload.
222+
auto *PreloadEnd = std::find_if(
223+
ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
224+
[&](const std::pair<LoadInst *, unsigned> &Load) {
225+
unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
226+
unsigned LoadOffset = Load.second;
227+
if (!tryAllocPreloadSGPRs(LoadSize,
228+
LoadOffset + ImplicitArgsBaseOffset,
229+
LastExplicitArgOffset))
230+
return true;
231+
232+
LastExplicitArgOffset = LoadOffset + LoadSize;
233+
return false;
234+
});
235+
236+
if (PreloadEnd == ImplicitArgLoads.begin())
237+
return;
238+
239+
unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
240+
Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
241+
assert(NF);
242+
for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
243+
LoadInst *LoadInst = I->first;
244+
unsigned LoadOffset = I->second;
245+
unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
246+
unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
247+
Argument *Arg = NF->getArg(Index);
248+
LoadInst->replaceAllUsesWith(Arg);
249+
}
250+
}
67251
};
68252

69253
class AMDGPULowerKernelArguments : public FunctionPass {
@@ -142,6 +326,12 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
142326
uint64_t LastExplicitArgOffset = ExplicitArgOffset;
143327
ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
144328

329+
// Guard against the situation where hidden arguments have already been
330+
// lowered and added to the kernel function signature, i.e. in a situation
331+
// where this pass has run twice.
332+
if (Arg.hasAttribute("amdgpu-hidden-argument"))
333+
break;
334+
145335
// Try to preload this argument into user SGPRs.
146336
if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() &&
147337
!Arg.getType()->isAggregateType())
@@ -281,6 +471,14 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
281471
KernArgSegment->addRetAttr(
282472
Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
283473

474+
if (InPreloadSequence) {
475+
uint64_t ImplicitArgsBaseOffset =
476+
alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
477+
BaseOffset;
478+
PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
479+
Builder);
480+
}
481+
284482
return true;
285483
}
286484

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

+3
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,9 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
314314
MaxAlign = Align(1);
315315

316316
for (const Argument &Arg : F.args()) {
317+
if (Arg.hasAttribute("amdgpu-hidden-argument"))
318+
continue;
319+
317320
const bool IsByRef = Arg.hasByRefAttr();
318321
Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
319322
Align Alignment = DL.getValueOrABITypeAlignment(

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

+19-6
Original file line numberDiff line numberDiff line change
@@ -2510,24 +2510,25 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
25102510
const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
25112511
const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
25122512
Function &F = MF.getFunction();
2513-
unsigned LastExplicitArgOffset =
2514-
MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
2513+
unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
25152514
GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
25162515
bool InPreloadSequence = true;
25172516
unsigned InIdx = 0;
2517+
bool AlignedForImplictArgs = false;
2518+
unsigned ImplicitArgOffset = 0;
25182519
for (auto &Arg : F.args()) {
25192520
if (!InPreloadSequence || !Arg.hasInRegAttr())
25202521
break;
25212522

2522-
int ArgIdx = Arg.getArgNo();
2523+
unsigned ArgIdx = Arg.getArgNo();
25232524
// Don't preload non-original args or parts not in the current preload
25242525
// sequence.
2525-
if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2526-
(int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2526+
if (InIdx < Ins.size() &&
2527+
(!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
25272528
break;
25282529

25292530
for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2530-
(int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2531+
Ins[InIdx].getOrigArgIndex() == ArgIdx;
25312532
InIdx++) {
25322533
assert(ArgLocs[ArgIdx].isMemLoc());
25332534
auto &ArgLoc = ArgLocs[InIdx];
@@ -2537,6 +2538,18 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
25372538
unsigned NumAllocSGPRs =
25382539
alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
25392540

2541+
// Fix alignment for hidden arguments.
2542+
if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2543+
if (!AlignedForImplictArgs) {
2544+
ImplicitArgOffset =
2545+
alignTo(LastExplicitArgOffset,
2546+
Subtarget->getAlignmentForImplicitArgPtr()) -
2547+
LastExplicitArgOffset;
2548+
AlignedForImplictArgs = true;
2549+
}
2550+
ArgOffset += ImplicitArgOffset;
2551+
}
2552+
25402553
// Arg is preloaded into the previous SGPR.
25412554
if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
25422555
Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(

0 commit comments

Comments
 (0)