Skip to content

Commit e2f0d77

Browse files
committed
[AMDGPU] Support preloading hidden kernel arguments
Adds hidden kernel arguments to the function signature and marks them inreg if they should be preloaded into user SGPRs. The normal kernarg preloading logic then takes over with some additional checks for the correct implicitarg_ptr alignment. Special care is needed so that metadata for the hidden arguments is not added twice when generating the code object.
1 parent 90617e9 commit e2f0d77

8 files changed

+1061
-7
lines changed

llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp

+7-1
Original file line numberDiff line numberDiff line change
@@ -260,8 +260,14 @@ void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,
260260
auto &Func = MF.getFunction();
261261
unsigned Offset = 0;
262262
auto Args = HSAMetadataDoc->getArrayNode();
263-
for (auto &Arg : Func.args())
263+
for (auto &Arg : Func.args()) {
264+
if (Func.getAttributes().hasAttributeAtIndex(AttributeList::FirstArgIndex +
265+
Arg.getArgNo(),
266+
"amdgpu-hidden-argument"))
267+
continue;
268+
264269
emitKernelArg(Arg, Offset, Args);
270+
}
265271

266272
emitHiddenKernelArgs(MF, Offset, Args);
267273

llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp

+197-2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313

1414
#include "AMDGPU.h"
1515
#include "GCNSubtarget.h"
16+
#include "llvm/ADT/StringExtras.h"
17+
#include "llvm/Analysis/ValueTracking.h"
1618
#include "llvm/CodeGen/TargetPassConfig.h"
1719
#include "llvm/IR/IRBuilder.h"
1820
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -31,9 +33,109 @@ class PreloadKernelArgInfo {
3133
const GCNSubtarget &ST;
3234
unsigned NumFreeUserSGPRs;
3335

34-
public:
35-
SmallVector<llvm::Metadata *, 8> KernelArgMetadata;
36+
// Hidden (implicit) kernel arguments that can currently be preloaded into
// user SGPRs, listed in kernarg-segment offset order. Each enumerator value
// doubles as an index into the HiddenArgs table.
enum HiddenArg : unsigned {
  HIDDEN_BLOCK_COUNT_X,
  HIDDEN_BLOCK_COUNT_Y,
  HIDDEN_BLOCK_COUNT_Z,
  HIDDEN_GROUP_SIZE_X,
  HIDDEN_GROUP_SIZE_Y,
  HIDDEN_GROUP_SIZE_Z,
  HIDDEN_REMAINDER_X,
  HIDDEN_REMAINDER_Y,
  HIDDEN_REMAINDER_Z,
  // Sentinel: count of preloadable hidden args, also used as "not found".
  END_HIDDEN_ARGS
};
48+
49+
// Stores information about a specific hidden argument.
struct HiddenArgInfo {
  // Offset in bytes from the location in the kernarg segment pointed to by
  // the implicitarg pointer.
  uint8_t Offset;
  // The size of the hidden argument in bytes.
  uint8_t Size;
  // The name of the hidden argument in the kernel signature.
  const char *Name;
};
59+
60+
// Offset/size/name table for the preloadable hidden arguments, indexed by
// HiddenArg: three 4-byte block counts followed by 2-byte group sizes and
// remainders, packed contiguously starting at the implicitarg pointer.
static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
    {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
    {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
    {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
    {18, 2, "_hidden_remainder_x"},  {20, 2, "_hidden_remainder_y"},
    {22, 2, "_hidden_remainder_z"}};
66+
67+
// Map a byte offset from the implicitarg pointer to the hidden argument
// that starts there, or END_HIDDEN_ARGS if no hidden argument matches.
static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
  unsigned Idx = 0;
  while (Idx < END_HIDDEN_ARGS && HiddenArgs[Idx].Offset != Offset)
    ++Idx;
  // Idx is either the matching entry or the END_HIDDEN_ARGS sentinel.
  return static_cast<HiddenArg>(Idx);
}
74+
75+
// Return the IR type of hidden argument \p HA: an integer type whose bit
// width matches the argument's byte size in the HiddenArgs table.
static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
  if (HA >= END_HIDDEN_ARGS)
    llvm_unreachable("Unexpected hidden argument.");
  return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);
}
81+
82+
// Return the kernel-signature parameter name for hidden argument \p HA.
static const char *getHiddenArgName(HiddenArg HA) {
  if (HA >= END_HIDDEN_ARGS)
    llvm_unreachable("Unexpected hidden argument.");
  return HiddenArgs[HA].Name;
}
88+
89+
// Clones the function after adding implicit arguments to the argument list
// and returns the new updated function. Preloaded implicit arguments are
// added up to and including the last one that will be preloaded, indicated by
// LastPreloadIndex. Currently preloading is only performed on the totality of
// sequential data from the kernarg segment including implicit (hidden)
// arguments. This means that all arguments up to the last preloaded argument
// will also be preloaded even if that data is unused.
Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
  FunctionType *FT = F.getFunctionType();
  LLVMContext &Ctx = F.getParent()->getContext();
  // New signature: every original parameter, then one appended parameter
  // per hidden argument in [0, LastPreloadIndex].
  SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
  for (unsigned I = 0; I <= LastPreloadIndex; ++I)
    FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));

  FunctionType *NFT =
      FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
  Function *NF =
      Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());

  NF->copyAttributesFrom(&F);
  NF->copyMetadata(&F, 0);
  NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);

  // Insert the clone directly before F, steal F's name, then move F's
  // entire body into the clone.
  F.getParent()->getFunctionList().insert(F.getIterator(), NF);
  NF->takeName(&F);
  NF->splice(NF->begin(), &F);

  // Rewire all uses of F's original arguments to the clone's corresponding
  // (leading) arguments; NFArg is left pointing at the first hidden arg.
  Function::arg_iterator NFArg = NF->arg_begin();
  for (Argument &Arg : F.args()) {
    Arg.replaceAllUsesWith(&*NFArg);
    NFArg->takeName(&Arg);
    ++NFArg;
  }

  // Mark each appended hidden argument inreg (so it is preloaded) and tag
  // it "amdgpu-hidden-argument" so later passes can distinguish it from an
  // explicit kernel argument (e.g. when emitting metadata).
  AttrBuilder AB(Ctx);
  AB.addAttribute(Attribute::InReg);
  AB.addAttribute("amdgpu-hidden-argument");
  AttributeList AL = NF->getAttributes();
  for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
    AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
    NFArg++->setName(getHiddenArgName(HiddenArg(I)));
  }

  NF->setAttributes(AL);
  F.replaceAllUsesWith(NF);

  return NF;
}
36137

138+
public:
37139
PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
38140
setInitialFreeUserSGPRsCount();
39141
}
@@ -64,6 +166,91 @@ class PreloadKernelArgInfo {
64166
NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
65167
return true;
66168
}
169+
170+
// Try to allocate SGPRs to preload implicit kernel arguments.
171+
void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
172+
IRBuilder<> &Builder) {
173+
StringRef Name = Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr);
174+
Function *ImplicitArgPtr = F.getParent()->getFunction(Name);
175+
if (!ImplicitArgPtr)
176+
return;
177+
178+
const DataLayout &DL = F.getParent()->getDataLayout();
179+
// Pair is the load and the load offset.
180+
SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
181+
for (auto *U : ImplicitArgPtr->users()) {
182+
Instruction *CI = dyn_cast<Instruction>(U);
183+
if (!CI || CI->getParent()->getParent() != &F)
184+
continue;
185+
186+
for (auto *U : CI->users()) {
187+
int64_t Offset = 0;
188+
auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
189+
if (!Load) {
190+
if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
191+
continue;
192+
193+
Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
194+
}
195+
196+
if (!Load || !Load->isSimple())
197+
continue;
198+
199+
// FIXME: Expand to handle 64-bit implicit args and large merged loads.
200+
LLVMContext &Ctx = F.getParent()->getContext();
201+
Type *LoadTy = Load->getType();
202+
HiddenArg HA = getHiddenArgFromOffset(Offset);
203+
if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
204+
continue;
205+
206+
ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
207+
}
208+
}
209+
210+
if (ImplicitArgLoads.empty())
211+
return;
212+
213+
// Allocate loads in order of offset. We need to be sure that the implicit
214+
// argument can actually be preloaded.
215+
std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
216+
[](const std::pair<LoadInst *, unsigned> &A,
217+
const std::pair<LoadInst *, unsigned> &B) {
218+
return A.second < B.second;
219+
});
220+
221+
uint64_t LastExplicitArgOffset = ImplicitArgsBaseOffset;
222+
// If we fail to preload any implicit argument we know we don't have SGPRs
223+
// to preload any subsequent ones with larger offsets. Find the first
224+
// argument that we cannot preload.
225+
auto *PreloadEnd = std::find_if(
226+
ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
227+
[&](const std::pair<LoadInst *, unsigned> &Load) {
228+
unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
229+
unsigned LoadOffset = Load.second;
230+
if (!tryAllocPreloadSGPRs(LoadSize,
231+
LoadOffset + ImplicitArgsBaseOffset,
232+
LastExplicitArgOffset))
233+
return true;
234+
235+
LastExplicitArgOffset = LoadOffset + LoadSize;
236+
return false;
237+
});
238+
239+
if (PreloadEnd == ImplicitArgLoads.begin())
240+
return;
241+
242+
unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
243+
Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
244+
assert(NF);
245+
for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
246+
LoadInst *LoadInst = I->first;
247+
unsigned LoadOffset = I->second;
248+
unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
249+
unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
250+
Argument *Arg = NF->getArg(Index);
251+
LoadInst->replaceAllUsesWith(Arg);
252+
}
253+
}
67254
};
68255

69256
class AMDGPULowerKernelArguments : public FunctionPass {
@@ -281,6 +468,14 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
281468
KernArgSegment->addRetAttr(
282469
Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
283470

471+
if (InPreloadSequence) {
472+
uint64_t ImplicitArgsBaseOffset =
473+
alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
474+
BaseOffset;
475+
PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
476+
Builder);
477+
}
478+
284479
return true;
285480
}
286481

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

+5
Original file line numberDiff line numberDiff line change
@@ -611,6 +611,11 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
611611
MaxAlign = Align(1);
612612

613613
for (const Argument &Arg : F.args()) {
614+
if (F.getAttributes().hasAttributeAtIndex(AttributeList::FirstArgIndex +
615+
Arg.getArgNo(),
616+
"amdgpu-hidden-argument"))
617+
continue;
618+
614619
const bool IsByRef = Arg.hasByRefAttr();
615620
Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
616621
Align Alignment = DL.getValueOrABITypeAlignment(

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

+22-4
Original file line numberDiff line numberDiff line change
@@ -2511,19 +2511,20 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
25112511
GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
25122512
bool InPreloadSequence = true;
25132513
unsigned InIdx = 0;
2514+
bool AlignedForImplictArgs = false;
25142515
for (auto &Arg : F.args()) {
25152516
if (!InPreloadSequence || !Arg.hasInRegAttr())
25162517
break;
25172518

2518-
int ArgIdx = Arg.getArgNo();
2519+
unsigned ArgIdx = Arg.getArgNo();
25192520
// Don't preload non-original args or parts not in the current preload
25202521
// sequence.
2521-
if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2522-
(int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2522+
if (InIdx < Ins.size() &&
2523+
(!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
25232524
break;
25242525

25252526
for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2526-
(int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2527+
Ins[InIdx].getOrigArgIndex() == ArgIdx;
25272528
InIdx++) {
25282529
assert(ArgLocs[ArgIdx].isMemLoc());
25292530
auto &ArgLoc = ArgLocs[InIdx];
@@ -2533,6 +2534,23 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
25332534
unsigned NumAllocSGPRs =
25342535
alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
25352536

2537+
// Add padding SGPRs to fix alignment for hidden arguments.
2538+
if (!AlignedForImplictArgs &&
2539+
F.getAttributes().hasAttributeAtIndex(AttributeList::FirstArgIndex +
2540+
Arg.getArgNo(),
2541+
"amdgpu-hidden-argument")) {
2542+
unsigned OffsetBefore = LastExplicitArgOffset;
2543+
LastExplicitArgOffset = alignTo(
2544+
LastExplicitArgOffset, Subtarget->getAlignmentForImplicitArgPtr());
2545+
if (OffsetBefore != LastExplicitArgOffset) {
2546+
unsigned PaddingSGPRs =
2547+
alignTo(LastExplicitArgOffset - OffsetBefore, 4) / 4;
2548+
Info.allocateUserSGPRs(PaddingSGPRs);
2549+
ArgOffset += PaddingSGPRs * 4;
2550+
}
2551+
AlignedForImplictArgs = true;
2552+
}
2553+
25362554
// Arg is preloaded into the previous SGPR.
25372555
if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
25382556
Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

+8
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,14 @@ SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
278278
return &ArgInfo.PreloadKernArgs[KernArgIdx].Regs;
279279
}
280280

281+
bool SIMachineFunctionInfo::allocateUserSGPRs(unsigned Number) {
282+
if (Number <= getNumUserSGPRs())
283+
return false;
284+
285+
NumUserSGPRs = Number;
286+
return true;
287+
}
288+
281289
void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
282290
uint64_t Size, Align Alignment) {
283291
// Skip if it is an entry function or the register is already added.

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

+3
Original file line numberDiff line numberDiff line change
@@ -760,6 +760,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
760760
unsigned AllocSizeDWord, int KernArgIdx,
761761
int PaddingSGPRs);
762762

763+
/// Grow the reserved user-SGPR count to \p Number; returns true if the
/// count was increased.
764+
bool allocateUserSGPRs(unsigned Number);
765+
763766
/// Increment user SGPRs used for padding the argument list only.
764767
Register addReservedUserSGPR() {
765768
Register Next = getNextUserSGPR();

0 commit comments

Comments
 (0)