Skip to content

Commit b98dee8

Browse files
committed
[AMDGPU] Support preloading hidden kernel arguments
Adds hidden kernel arguments to the function signature and marks them inreg if they should be preloaded into user SGPRs. The normal kernarg preloading logic then takes over with some additional checks for the correct implicitarg_ptr alignment. Special care is needed so that metadata for the hidden arguments is not added twice when generating the code object.
1 parent 7dbc664 commit b98dee8

8 files changed

+1061
-7
lines changed

llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp

+7-1
Original file line numberDiff line numberDiff line change
@@ -260,8 +260,14 @@ void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,
260260
auto &Func = MF.getFunction();
261261
unsigned Offset = 0;
262262
auto Args = HSAMetadataDoc->getArrayNode();
263-
for (auto &Arg : Func.args())
263+
for (auto &Arg : Func.args()) {
264+
if (Func.getAttributes().hasAttributeAtIndex(AttributeList::FirstArgIndex +
265+
Arg.getArgNo(),
266+
"amdgpu-hidden-argument"))
267+
continue;
268+
264269
emitKernelArg(Arg, Offset, Args);
270+
}
265271

266272
emitHiddenKernelArgs(MF, Offset, Args);
267273

llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp

+197-2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313

1414
#include "AMDGPU.h"
1515
#include "GCNSubtarget.h"
16+
#include "llvm/ADT/StringExtras.h"
17+
#include "llvm/Analysis/ValueTracking.h"
1618
#include "llvm/CodeGen/TargetPassConfig.h"
1719
#include "llvm/IR/IRBuilder.h"
1820
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -31,9 +33,109 @@ class PreloadKernelArgInfo {
3133
const GCNSubtarget &ST;
3234
unsigned NumFreeUserSGPRs;
3335

34-
public:
35-
SmallVector<llvm::Metadata *, 8> KernelArgMetadata;
36+
// Hidden (implicit) kernel arguments that can be preloaded into user SGPRs,
// listed in the order they are laid out in the implicit kernarg segment.
// The offsets and sizes for each entry live in the HiddenArgs table below.
enum HiddenArg : unsigned {
37+
HIDDEN_BLOCK_COUNT_X,
38+
HIDDEN_BLOCK_COUNT_Y,
39+
HIDDEN_BLOCK_COUNT_Z,
40+
HIDDEN_GROUP_SIZE_X,
41+
HIDDEN_GROUP_SIZE_Y,
42+
HIDDEN_GROUP_SIZE_Z,
43+
HIDDEN_REMAINDER_X,
44+
HIDDEN_REMAINDER_Y,
45+
HIDDEN_REMAINDER_Z,
46+
// Sentinel; also the number of supported hidden arguments.
END_HIDDEN_ARGS
47+
};
48+
49+
// Stores information about a specific hidden argument.
50+
struct HiddenArgInfo {
51+
// Offset in bytes from the location in the kernarg segment pointed to by
// the implicitarg pointer.
54+
uint8_t Offset;
54+
// The size of the hidden argument in bytes.
55+
uint8_t Size;
56+
// The name of the hidden argument in the kernel signature.
57+
const char *Name;
58+
};
59+
60+
// Layout of each hidden argument, indexed by HiddenArg. Block counts are
// 4 bytes; group sizes and remainders are 2 bytes, per the entries below.
static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
61+
{0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
62+
{8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
63+
{14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
64+
{18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"},
65+
{22, 2, "_hidden_remainder_z"}};
66+
67+
// Find the hidden argument whose byte offset (relative to the implicitarg
// pointer) equals \p Offset. Returns END_HIDDEN_ARGS when no entry matches.
static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
  unsigned Idx = 0;
  while (Idx != END_HIDDEN_ARGS && HiddenArgs[Idx].Offset != Offset)
    ++Idx;
  return static_cast<HiddenArg>(Idx);
}
74+
75+
// Return the integer IR type whose bit width matches the in-memory size of
// hidden argument \p HA.
static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
  if (HA >= END_HIDDEN_ARGS)
    llvm_unreachable("Unexpected hidden argument.");
  return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);
}
81+
82+
// Return the parameter name used for hidden argument \p HA in the kernel
// signature.
static const char *getHiddenArgName(HiddenArg HA) {
  if (HA >= END_HIDDEN_ARGS)
    llvm_unreachable("Unexpected hidden argument.");
  return HiddenArgs[HA].Name;
}
88+
89+
// Clones the function after adding implicit arguments to the argument list
90+
// and returns the new updated function. Preloaded implicit arguments are
91+
// added up to and including the last one that will be preloaded, indicated by
92+
// LastPreloadIndex. Currently preloading is only performed on the totality of
93+
// sequential data from the kernarg segment including implicit (hidden)
94+
// arguments. This means that all arguments up to the last preloaded argument
95+
// will also be preloaded even if that data is unused.
// \p LastPreloadIndex is an index into the HiddenArg enum (inclusive).
96+
Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
97+
FunctionType *FT = F.getFunctionType();
98+
LLVMContext &Ctx = F.getParent()->getContext();
99+
// Start from the original parameter list, then append one integer parameter
// per hidden argument up to and including LastPreloadIndex.
SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
100+
for (unsigned I = 0; I <= LastPreloadIndex; ++I)
101+
FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));
102+
103+
FunctionType *NFT =
104+
FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
105+
Function *NF =
106+
Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());
107+
108+
NF->copyAttributesFrom(&F);
109+
NF->copyMetadata(&F, 0);
110+
NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);
111+
112+
// Insert the clone immediately before F so module order is preserved, then
// steal F's name and move its entire body (all basic blocks) into NF.
F.getParent()->getFunctionList().insert(F.getIterator(), NF);
113+
NF->takeName(&F);
114+
NF->splice(NF->begin(), &F);
115+
116+
// Rewire uses of the old arguments to the corresponding new arguments and
// transfer their names; NFArg is left pointing at the first hidden arg.
Function::arg_iterator NFArg = NF->arg_begin();
117+
for (Argument &Arg : F.args()) {
118+
Arg.replaceAllUsesWith(&*NFArg);
119+
NFArg->takeName(&Arg);
120+
++NFArg;
121+
}
122+
123+
// Mark each appended hidden argument inreg (so the normal kernarg
// preloading logic picks it up) and tag it "amdgpu-hidden-argument" so
// later consumers (e.g. metadata emission) can tell it apart from
// explicit arguments.
AttrBuilder AB(Ctx);
124+
AB.addAttribute(Attribute::InReg);
125+
AB.addAttribute("amdgpu-hidden-argument");
126+
AttributeList AL = NF->getAttributes();
127+
for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
128+
AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
129+
NFArg++->setName(getHiddenArgName(HiddenArg(I)));
130+
}
131+
132+
NF->setAttributes(AL);
133+
// Redirect any remaining references (call sites, etc.) to the clone.
// NOTE(review): the old, now-bodyless F is not erased here — presumably
// the caller or later cleanup removes it; confirm.
F.replaceAllUsesWith(NF);
134+
135+
return NF;
136+
}
36137

138+
public:
37139
PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
38140
setInitialFreeUserSGPRsCount();
39141
}
@@ -64,6 +166,91 @@ class PreloadKernelArgInfo {
64166
NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
65167
return true;
66168
}
169+
170+
// Try to allocate SGPRs to preload implicit kernel arguments.
171+
void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
172+
IRBuilder<> &Builder) {
173+
StringRef Name = Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr);
174+
Function *ImplicitArgPtr = F.getParent()->getFunction(Name);
175+
if (!ImplicitArgPtr)
176+
return;
177+
178+
const DataLayout &DL = F.getParent()->getDataLayout();
179+
// Pair is the load and the load offset.
180+
SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
181+
for (auto *U : ImplicitArgPtr->users()) {
182+
Instruction *CI = dyn_cast<Instruction>(U);
183+
if (!CI || CI->getParent()->getParent() != &F)
184+
continue;
185+
186+
for (auto *U : CI->users()) {
187+
int64_t Offset = 0;
188+
auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
189+
if (!Load) {
190+
if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
191+
continue;
192+
193+
Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
194+
}
195+
196+
if (!Load || !Load->isSimple())
197+
continue;
198+
199+
// FIXME: Expand to handle 64-bit implicit args and large merged loads.
200+
LLVMContext &Ctx = F.getParent()->getContext();
201+
Type *LoadTy = Load->getType();
202+
HiddenArg HA = getHiddenArgFromOffset(Offset);
203+
if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
204+
continue;
205+
206+
ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
207+
}
208+
}
209+
210+
if (ImplicitArgLoads.empty())
211+
return;
212+
213+
// Allocate loads in order of offset. We need to be sure that the implicit
214+
// argument can actually be preloaded.
215+
std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
216+
[](const std::pair<LoadInst *, unsigned> &A,
217+
const std::pair<LoadInst *, unsigned> &B) {
218+
return A.second < B.second;
219+
});
220+
221+
uint64_t LastExplicitArgOffset = ImplicitArgsBaseOffset;
222+
// If we fail to preload any implicit argument we know we don't have SGPRs
223+
// to preload any subsequent ones with larger offsets. Find the first
224+
// argument that we cannot preload.
225+
auto *PreloadEnd = std::find_if(
226+
ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
227+
[&](const std::pair<LoadInst *, unsigned> &Load) {
228+
unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
229+
unsigned LoadOffset = Load.second;
230+
if (!tryAllocPreloadSGPRs(LoadSize,
231+
LoadOffset + ImplicitArgsBaseOffset,
232+
LastExplicitArgOffset))
233+
return true;
234+
235+
LastExplicitArgOffset = LoadOffset + LoadSize;
236+
return false;
237+
});
238+
239+
if (PreloadEnd == ImplicitArgLoads.begin())
240+
return;
241+
242+
unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
243+
Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
244+
assert(NF);
245+
for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
246+
LoadInst *LoadInst = I->first;
247+
unsigned LoadOffset = I->second;
248+
unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
249+
unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
250+
Argument *Arg = NF->getArg(Index);
251+
LoadInst->replaceAllUsesWith(Arg);
252+
}
253+
}
67254
};
68255

69256
class AMDGPULowerKernelArguments : public FunctionPass {
@@ -281,6 +468,14 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
281468
KernArgSegment->addRetAttr(
282469
Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
283470

471+
if (InPreloadSequence) {
472+
uint64_t ImplicitArgsBaseOffset =
473+
alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
474+
BaseOffset;
475+
PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
476+
Builder);
477+
}
478+
284479
return true;
285480
}
286481

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

+5
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,11 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
314314
MaxAlign = Align(1);
315315

316316
for (const Argument &Arg : F.args()) {
317+
if (F.getAttributes().hasAttributeAtIndex(AttributeList::FirstArgIndex +
318+
Arg.getArgNo(),
319+
"amdgpu-hidden-argument"))
320+
continue;
321+
317322
const bool IsByRef = Arg.hasByRefAttr();
318323
Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
319324
Align Alignment = DL.getValueOrABITypeAlignment(

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

+22-4
Original file line numberDiff line numberDiff line change
@@ -2515,19 +2515,20 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
25152515
GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
25162516
bool InPreloadSequence = true;
25172517
unsigned InIdx = 0;
2518+
bool AlignedForImplictArgs = false;
25182519
for (auto &Arg : F.args()) {
25192520
if (!InPreloadSequence || !Arg.hasInRegAttr())
25202521
break;
25212522

2522-
int ArgIdx = Arg.getArgNo();
2523+
unsigned ArgIdx = Arg.getArgNo();
25232524
// Don't preload non-original args or parts not in the current preload
25242525
// sequence.
2525-
if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2526-
(int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2526+
if (InIdx < Ins.size() &&
2527+
(!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
25272528
break;
25282529

25292530
for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2530-
(int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2531+
Ins[InIdx].getOrigArgIndex() == ArgIdx;
25312532
InIdx++) {
25322533
assert(ArgLocs[ArgIdx].isMemLoc());
25332534
auto &ArgLoc = ArgLocs[InIdx];
@@ -2537,6 +2538,23 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
25372538
unsigned NumAllocSGPRs =
25382539
alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
25392540

2541+
// Add padding SGPRs to fix alignment for hidden arguments.
2542+
if (!AlignedForImplictArgs &&
2543+
F.getAttributes().hasAttributeAtIndex(AttributeList::FirstArgIndex +
2544+
Arg.getArgNo(),
2545+
"amdgpu-hidden-argument")) {
2546+
unsigned OffsetBefore = LastExplicitArgOffset;
2547+
LastExplicitArgOffset = alignTo(
2548+
LastExplicitArgOffset, Subtarget->getAlignmentForImplicitArgPtr());
2549+
if (OffsetBefore != LastExplicitArgOffset) {
2550+
unsigned PaddingSGPRs =
2551+
alignTo(LastExplicitArgOffset - OffsetBefore, 4) / 4;
2552+
Info.allocateUserSGPRs(PaddingSGPRs);
2553+
ArgOffset += PaddingSGPRs * 4;
2554+
}
2555+
AlignedForImplictArgs = true;
2556+
}
2557+
25402558
// Arg is preloaded into the previous SGPR.
25412559
if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
25422560
Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

+8
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,14 @@ SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
277277
return &ArgInfo.PreloadKernArgs[KernArgIdx].Regs;
278278
}
279279

280+
bool SIMachineFunctionInfo::allocateUserSGPRs(unsigned Number) {
281+
if (Number <= getNumUserSGPRs())
282+
return false;
283+
284+
NumUserSGPRs = Number;
285+
return true;
286+
}
287+
280288
void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
281289
uint64_t Size, Align Alignment) {
282290
// Skip if it is an entry function or the register is already added.

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

+3
Original file line numberDiff line numberDiff line change
@@ -780,6 +780,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
780780
unsigned AllocSizeDWord, int KernArgIdx,
781781
int PaddingSGPRs);
782782

783+
/// Reserve user SGPRs so that at least \p Number are allocated; returns
/// true if the reservation grew, false if that many were already reserved.
784+
bool allocateUserSGPRs(unsigned Number);
785+
783786
/// Increment user SGPRs used for padding the argument list only.
784787
Register addReservedUserSGPR() {
785788
Register Next = getNextUserSGPR();

0 commit comments

Comments
 (0)