Skip to content

Commit 01d38d0

Browse files
committed
[AMDGPU] Support preloading hidden kernel arguments
Adds hidden kernel arguments to the function signature and marks them inreg if they should be preloaded into user SGPRs. The normal kernarg preloading logic then takes over with some additional checks for the correct implicitarg_ptr alignment. Special care is needed so that metadata for the hidden arguments is not added twice when generating the code object.
1 parent 90617e9 commit 01d38d0

8 files changed

+2648
-1380
lines changed

llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp

+7-1
Original file line numberDiff line numberDiff line change
@@ -260,8 +260,14 @@ void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,
260260
auto &Func = MF.getFunction();
261261
unsigned Offset = 0;
262262
auto Args = HSAMetadataDoc->getArrayNode();
263-
for (auto &Arg : Func.args())
263+
for (auto &Arg : Func.args()) {
264+
if (Func.getAttributes().hasAttributeAtIndex(AttributeList::FirstArgIndex +
265+
Arg.getArgNo(),
266+
"amdgpu-hidden-argument"))
267+
continue;
268+
264269
emitKernelArg(Arg, Offset, Args);
270+
}
265271

266272
emitHiddenKernelArgs(MF, Offset, Args);
267273

llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp

+183-2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313

1414
#include "AMDGPU.h"
1515
#include "GCNSubtarget.h"
16+
#include "llvm/ADT/StringExtras.h"
17+
#include "llvm/Analysis/ValueTracking.h"
1618
#include "llvm/CodeGen/TargetPassConfig.h"
1719
#include "llvm/IR/IRBuilder.h"
1820
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -31,9 +33,97 @@ class PreloadKernelArgInfo {
3133
const GCNSubtarget &ST;
3234
unsigned NumFreeUserSGPRs;
3335

34-
public:
35-
SmallVector<llvm::Metadata *, 8> KernelArgMetadata;
36+
/// Index of each hidden kernel argument that may be preloaded. The numeric
/// value of an enumerator is its position in the HiddenArgs table.
enum HiddenArg : unsigned {
  HIDDEN_BLOCK_COUNT_X = 0,
  HIDDEN_BLOCK_COUNT_Y,
  HIDDEN_BLOCK_COUNT_Z,
  HIDDEN_GROUP_SIZE_X,
  HIDDEN_GROUP_SIZE_Y,
  HIDDEN_GROUP_SIZE_Z,
  HIDDEN_REMAINDER_X,
  HIDDEN_REMAINDER_Y,
  HIDDEN_REMAINDER_Z,
  END_HIDDEN_ARGS // Number of hidden arguments; not a valid argument itself.
};

/// Static description of one hidden argument.
struct HiddenArgInfo {
  uint8_t Offset;   // Byte offset matched against constant load offsets.
  uint8_t Size;     // Size in bytes; the IR type is an integer of Size * 8 bits.
  const char *Name; // Name given to the corresponding IR argument.
};

/// One entry per HiddenArg enumerator, in enumerator order.
static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
    {0, 4, "_hidden_block_count_x"},
    {4, 4, "_hidden_block_count_y"},
    {8, 4, "_hidden_block_count_z"},
    {12, 2, "_hidden_group_size_x"},
    {14, 2, "_hidden_group_size_y"},
    {16, 2, "_hidden_group_size_z"},
    {18, 2, "_hidden_remainder_x"},
    {20, 2, "_hidden_remainder_y"},
    {22, 2, "_hidden_remainder_z"},
};
61+
62+
/// Map a byte offset to the hidden argument whose table entry starts at that
/// offset. The offset must match an entry of HiddenArgs exactly; any other
/// value reaches llvm_unreachable.
static HiddenArg getHiddenArgIndexFromOffset(unsigned Offset) {
  unsigned Index = 0;
  for (const HiddenArgInfo &Info : HiddenArgs) {
    if (Info.Offset == Offset)
      return static_cast<HiddenArg>(Index);
    ++Index;
  }

  llvm_unreachable("Unexpected hidden argument offset.");
}
69+
70+
/// Return the integer type used for hidden argument \p HA: its width is the
/// table entry's size in bytes times eight. \p HA must be below
/// END_HIDDEN_ARGS.
static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
  if (HA >= END_HIDDEN_ARGS)
    llvm_unreachable("Unexpected hidden argument.");

  const unsigned SizeInBits = HiddenArgs[HA].Size * 8;
  return Type::getIntNTy(Ctx, SizeInBits);
}
76+
77+
/// Return the name recorded in the HiddenArgs table for \p HA. \p HA must be
/// below END_HIDDEN_ARGS.
static const char *getHiddenArgName(HiddenArg HA) {
  if (HA >= END_HIDDEN_ARGS)
    llvm_unreachable("Unexpected hidden argument.");

  return HiddenArgs[HA].Name;
}
83+
84+
/// Build a replacement for F whose parameter list is F's own parameters
/// followed by hidden arguments 0 through \p LastPreloadIndex (inclusive).
///
/// The new function takes over F's attributes, metadata, name and body, is
/// inserted into the module immediately before F, and replaces all uses of F.
/// Every appended hidden argument is marked inreg and tagged with the
/// "amdgpu-hidden-argument" attribute. F itself is left in the module.
///
/// \returns the newly created function.
Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
  FunctionType *OldTy = F.getFunctionType();
  LLVMContext &Ctx = F.getParent()->getContext();

  // Original parameter types first, then one type per hidden argument.
  SmallVector<Type *, 16> ParamTys(OldTy->param_begin(), OldTy->param_end());
  for (unsigned Idx = 0; Idx <= LastPreloadIndex; ++Idx)
    ParamTys.push_back(getHiddenArgType(Ctx, HiddenArg(Idx)));

  FunctionType *NewTy =
      FunctionType::get(OldTy->getReturnType(), ParamTys, OldTy->isVarArg());
  Function *NF = Function::Create(NewTy, F.getLinkage(), F.getAddressSpace(),
                                  F.getName());

  NF->copyAttributesFrom(&F);
  NF->copyMetadata(&F, 0);
  NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);

  // Place the new function next to the old one, then move the name and the
  // basic blocks across.
  F.getParent()->getFunctionList().insert(F.getIterator(), NF);
  NF->takeName(&F);
  NF->splice(NF->begin(), &F);

  // Point uses of each original parameter at its counterpart in NF. When this
  // loop finishes, NewArg refers to the first appended hidden argument.
  Function::arg_iterator NewArg = NF->arg_begin();
  for (Argument &OldArg : F.args()) {
    OldArg.replaceAllUsesWith(&*NewArg);
    NewArg->takeName(&OldArg);
    ++NewArg;
  }

  AttrBuilder HiddenAttrs(Ctx);
  HiddenAttrs.addAttribute(Attribute::InReg);
  HiddenAttrs.addAttribute("amdgpu-hidden-argument");

  AttributeList Attrs = NF->getAttributes();
  for (unsigned Idx = 0; Idx <= LastPreloadIndex; ++Idx, ++NewArg) {
    Attrs = Attrs.addParamAttributes(Ctx, NewArg->getArgNo(), HiddenAttrs);
    NewArg->setName(getHiddenArgName(HiddenArg(Idx)));
  }

  NF->setAttributes(Attrs);
  F.replaceAllUsesWith(NF);

  return NF;
}
36125

126+
public:
37127
/// Bind the function and subtarget this object works on, then call
/// setInitialFreeUserSGPRsCount().
PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST)
    : F(F), ST(ST) {
  setInitialFreeUserSGPRsCount();
}
@@ -64,6 +154,89 @@ class PreloadKernelArgInfo {
64154
NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
65155
return true;
66156
}
157+
158+
// Try to allocate SGPRs to preload implicit kernel arguments.
159+
void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
160+
IRBuilder<> &Builder) {
161+
StringRef Name = Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr);
162+
Function *ImplicitArgPtr = F.getParent()->getFunction(Name);
163+
if (!ImplicitArgPtr)
164+
return;
165+
166+
const DataLayout &DL = F.getParent()->getDataLayout();
167+
// Pair is the load and the load offset.
168+
SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
169+
for (auto *U : ImplicitArgPtr->users()) {
170+
Instruction *CI = dyn_cast<Instruction>(U);
171+
if (!CI || CI->getParent()->getParent() != &F)
172+
continue;
173+
174+
for (auto *U : CI->users()) {
175+
int64_t Offset = 0;
176+
auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
177+
if (!Load) {
178+
if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
179+
continue;
180+
181+
Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
182+
}
183+
184+
if (!Load || !Load->isSimple())
185+
continue;
186+
187+
// FIXME: Expand to handle 64-bit implicit args and large merged loads.
188+
unsigned LoadSize = Load->getType()->getScalarSizeInBits();
189+
if (LoadSize != 32 && LoadSize != 16)
190+
continue;
191+
192+
ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
193+
}
194+
}
195+
196+
if (ImplicitArgLoads.empty())
197+
return;
198+
199+
// Allocate loads in order of offset. We need to be sure that the implicit
200+
// argument can actually be preloaded.
201+
std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
202+
[](const std::pair<LoadInst *, unsigned> &A,
203+
const std::pair<LoadInst *, unsigned> &B) {
204+
return A.second < B.second;
205+
});
206+
207+
uint64_t LastExplicitArgOffset = ImplicitArgsBaseOffset;
208+
// If we fail to preload any implicit argument we know we don't have SGPRs
209+
// to preload any subsequent ones with larger offsets. Find the first
210+
// argument that we cannot preload.
211+
auto *PreloadEnd = std::find_if(
212+
ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
213+
[&](const std::pair<LoadInst *, unsigned> &Load) {
214+
unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
215+
unsigned LoadOffset = Load.second;
216+
if (!tryAllocPreloadSGPRs(LoadSize,
217+
LoadOffset + ImplicitArgsBaseOffset,
218+
LastExplicitArgOffset))
219+
return true;
220+
221+
LastExplicitArgOffset = LoadOffset + LoadSize;
222+
return false;
223+
});
224+
225+
if (PreloadEnd == ImplicitArgLoads.begin())
226+
return;
227+
228+
unsigned LastHiddenArgIndex = getHiddenArgIndexFromOffset(PreloadEnd[-1].second);
229+
Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
230+
assert(NF);
231+
for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
232+
LoadInst *LoadInst = I->first;
233+
unsigned LoadOffset = I->second;
234+
unsigned HiddenArgIndex = getHiddenArgIndexFromOffset(LoadOffset);
235+
unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
236+
Argument *Arg = NF->getArg(Index);
237+
LoadInst->replaceAllUsesWith(Arg);
238+
}
239+
}
67240
};
68241

69242
class AMDGPULowerKernelArguments : public FunctionPass {
@@ -281,6 +454,14 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
281454
KernArgSegment->addRetAttr(
282455
Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
283456

457+
if (InPreloadSequence) {
458+
uint64_t ImplicitArgsBaseOffset =
459+
alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
460+
BaseOffset;
461+
PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
462+
Builder);
463+
}
464+
284465
return true;
285466
}
286467

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

+5
Original file line numberDiff line numberDiff line change
@@ -611,6 +611,11 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
611611
MaxAlign = Align(1);
612612

613613
for (const Argument &Arg : F.args()) {
614+
if (F.getAttributes().hasAttributeAtIndex(AttributeList::FirstArgIndex +
615+
Arg.getArgNo(),
616+
"amdgpu-hidden-argument"))
617+
continue;
618+
614619
const bool IsByRef = Arg.hasByRefAttr();
615620
Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
616621
Align Alignment = DL.getValueOrABITypeAlignment(

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

+22-4
Original file line numberDiff line numberDiff line change
@@ -2511,19 +2511,20 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
25112511
GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
25122512
bool InPreloadSequence = true;
25132513
unsigned InIdx = 0;
2514+
bool AlignedForImplictArgs = false;
25142515
for (auto &Arg : F.args()) {
25152516
if (!InPreloadSequence || !Arg.hasInRegAttr())
25162517
break;
25172518

2518-
int ArgIdx = Arg.getArgNo();
2519+
unsigned ArgIdx = Arg.getArgNo();
25192520
// Don't preload non-original args or parts not in the current preload
25202521
// sequence.
2521-
if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2522-
(int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2522+
if (InIdx < Ins.size() &&
2523+
(!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
25232524
break;
25242525

25252526
for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2526-
(int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2527+
Ins[InIdx].getOrigArgIndex() == ArgIdx;
25272528
InIdx++) {
25282529
assert(ArgLocs[ArgIdx].isMemLoc());
25292530
auto &ArgLoc = ArgLocs[InIdx];
@@ -2533,6 +2534,23 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
25332534
unsigned NumAllocSGPRs =
25342535
alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
25352536

2537+
// Add padding SGPRs to fix alignment for hidden arguments.
2538+
if (!AlignedForImplictArgs &&
2539+
F.getAttributes().hasAttributeAtIndex(AttributeList::FirstArgIndex +
2540+
Arg.getArgNo(),
2541+
"amdgpu-hidden-argument")) {
2542+
unsigned OffsetBefore = LastExplicitArgOffset;
2543+
LastExplicitArgOffset = alignTo(
2544+
LastExplicitArgOffset, Subtarget->getAlignmentForImplicitArgPtr());
2545+
if (OffsetBefore != LastExplicitArgOffset) {
2546+
unsigned PaddingSGPRs =
2547+
alignTo(LastExplicitArgOffset - OffsetBefore, 4) / 4;
2548+
Info.allocateUserSGPRs(PaddingSGPRs);
2549+
ArgOffset += PaddingSGPRs * 4;
2550+
}
2551+
AlignedForImplictArgs = true;
2552+
}
2553+
25362554
// Arg is preloaded into the previous SGPR.
25372555
if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
25382556
Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

+9
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,15 @@ SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
278278
return &ArgInfo.PreloadKernArgs[KernArgIdx].Regs;
279279
}
280280

281+
bool SIMachineFunctionInfo::allocateUserSGPRs(
282+
unsigned Number) {
283+
if (Number <= getNumUserSGPRs())
284+
return false;
285+
286+
NumUserSGPRs = Number;
287+
return true;
288+
}
289+
281290
void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
282291
uint64_t Size, Align Alignment) {
283292
// Skip if it is an entry function or the register is already added.

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

+3
Original file line numberDiff line numberDiff line change
@@ -760,6 +760,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
760760
unsigned AllocSizeDWord, int KernArgIdx,
761761
int PaddingSGPRs);
762762

763+
/// Ensure at least \p Number user SGPRs are reserved; returns true if the count was raised.
764+
bool allocateUserSGPRs(unsigned Number);
765+
763766
/// Increment user SGPRs used for padding the argument list only.
764767
Register addReservedUserSGPR() {
765768
Register Next = getNextUserSGPR();

0 commit comments

Comments
 (0)