Skip to content

Commit 2e0ce8a

Browse files
committed
[AMDGPU] Move kernarg preload logic to separate pass
Moves kernarg preload logic to its own module pass. Cloned function declarations are removed when preloading hidden arguments. The inreg attribute is now added in this pass instead of AMDGPUAttributor. The rest of the logic is copied from AMDGPULowerKernelArguments, which now only checks whether an argument is marked inreg to avoid replacing direct uses of preloaded arguments. This change requires test updates to remove inreg from lit tests with kernels that don't actually want preloading.
1 parent c617466 commit 2e0ce8a

20 files changed

+449
-639
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ ModulePass *createAMDGPULowerBufferFatPointersPass();
6565
FunctionPass *createSIModeRegisterPass();
6666
FunctionPass *createGCNPreRAOptimizationsLegacyPass();
6767
FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass();
68+
ModulePass *createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *);
6869

6970
struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
7071
AMDGPUSimplifyLibCallsPass() {}
@@ -231,6 +232,9 @@ extern char &GCNRegPressurePrinterID;
231232
void initializeAMDGPUPreloadKernArgPrologLegacyPass(PassRegistry &);
232233
extern char &AMDGPUPreloadKernArgPrologLegacyID;
233234

235+
void initializeAMDGPUPreloadKernelArgumentsLegacyPass(PassRegistry &);
236+
extern char &AMDGPUPreloadKernelArgumentsLegacyID;
237+
234238
// Passes common to R600 and SI
235239
FunctionPass *createAMDGPUPromoteAlloca();
236240
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
@@ -342,6 +346,17 @@ class AMDGPUAttributorPass : public PassInfoMixin<AMDGPUAttributorPass> {
342346
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
343347
};
344348

349+
class AMDGPUPreloadKernelArgumentsPass
350+
: public PassInfoMixin<AMDGPUPreloadKernelArgumentsPass> {
351+
const AMDGPUTargetMachine &TM;
352+
353+
public:
354+
explicit AMDGPUPreloadKernelArgumentsPass(const AMDGPUTargetMachine &TM)
355+
: TM(TM) {}
356+
357+
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
358+
};
359+
345360
class AMDGPUAnnotateUniformValuesPass
346361
: public PassInfoMixin<AMDGPUAnnotateUniformValuesPass> {
347362
public:

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,6 @@
2525

2626
using namespace llvm;
2727

28-
static cl::opt<unsigned> KernargPreloadCount(
29-
"amdgpu-kernarg-preload-count",
30-
cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));
31-
3228
static cl::opt<unsigned> IndirectCallSpecializationThreshold(
3329
"amdgpu-indirect-call-specialization-threshold",
3430
cl::desc(
@@ -1327,21 +1323,6 @@ struct AAAMDGPUNoAGPR
13271323

13281324
const char AAAMDGPUNoAGPR::ID = 0;
13291325

1330-
static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
1331-
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
1332-
for (unsigned I = 0;
1333-
I < F.arg_size() &&
1334-
I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs());
1335-
++I) {
1336-
Argument &Arg = *F.getArg(I);
1337-
// Check for incompatible attributes.
1338-
if (Arg.hasByRefAttr() || Arg.hasNestAttr())
1339-
break;
1340-
1341-
Arg.addAttr(Attribute::InReg);
1342-
}
1343-
}
1344-
13451326
static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
13461327
AMDGPUAttributorOptions Options) {
13471328
SetVector<Function *> Functions;
@@ -1391,8 +1372,6 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
13911372
if (!AMDGPU::isEntryFunctionCC(CC)) {
13921373
A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
13931374
A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(*F));
1394-
} else if (CC == CallingConv::AMDGPU_KERNEL) {
1395-
addPreloadKernArgHint(*F, TM);
13961375
}
13971376

13981377
for (auto &I : instructions(F)) {

llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp

Lines changed: 2 additions & 254 deletions
Original file line numberDiff line numberDiff line change
@@ -27,231 +27,6 @@ using namespace llvm;
2727

2828
namespace {
2929

30-
class PreloadKernelArgInfo {
31-
private:
32-
Function &F;
33-
const GCNSubtarget &ST;
34-
unsigned NumFreeUserSGPRs;
35-
36-
enum HiddenArg : unsigned {
37-
HIDDEN_BLOCK_COUNT_X,
38-
HIDDEN_BLOCK_COUNT_Y,
39-
HIDDEN_BLOCK_COUNT_Z,
40-
HIDDEN_GROUP_SIZE_X,
41-
HIDDEN_GROUP_SIZE_Y,
42-
HIDDEN_GROUP_SIZE_Z,
43-
HIDDEN_REMAINDER_X,
44-
HIDDEN_REMAINDER_Y,
45-
HIDDEN_REMAINDER_Z,
46-
END_HIDDEN_ARGS
47-
};
48-
49-
// Stores information about a specific hidden argument.
50-
struct HiddenArgInfo {
51-
// Offset in bytes from the location in the kernearg segment pointed to by
52-
// the implicitarg pointer.
53-
uint8_t Offset;
54-
// The size of the hidden argument in bytes.
55-
uint8_t Size;
56-
// The name of the hidden argument in the kernel signature.
57-
const char *Name;
58-
};
59-
60-
static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
61-
{0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
62-
{8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
63-
{14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
64-
{18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"},
65-
{22, 2, "_hidden_remainder_z"}};
66-
67-
static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
68-
for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
69-
if (HiddenArgs[I].Offset == Offset)
70-
return static_cast<HiddenArg>(I);
71-
72-
return END_HIDDEN_ARGS;
73-
}
74-
75-
static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
76-
if (HA < END_HIDDEN_ARGS)
77-
return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);
78-
79-
llvm_unreachable("Unexpected hidden argument.");
80-
}
81-
82-
static const char *getHiddenArgName(HiddenArg HA) {
83-
if (HA < END_HIDDEN_ARGS) {
84-
return HiddenArgs[HA].Name;
85-
}
86-
llvm_unreachable("Unexpected hidden argument.");
87-
}
88-
89-
// Clones the function after adding implicit arguments to the argument list
90-
// and returns the new updated function. Preloaded implicit arguments are
91-
// added up to and including the last one that will be preloaded, indicated by
92-
// LastPreloadIndex. Currently preloading is only performed on the totality of
93-
// sequential data from the kernarg segment including implicit (hidden)
94-
// arguments. This means that all arguments up to the last preloaded argument
95-
// will also be preloaded even if that data is unused.
96-
Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
97-
FunctionType *FT = F.getFunctionType();
98-
LLVMContext &Ctx = F.getParent()->getContext();
99-
SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
100-
for (unsigned I = 0; I <= LastPreloadIndex; ++I)
101-
FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));
102-
103-
FunctionType *NFT =
104-
FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
105-
Function *NF =
106-
Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());
107-
108-
NF->copyAttributesFrom(&F);
109-
NF->copyMetadata(&F, 0);
110-
NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);
111-
112-
F.getParent()->getFunctionList().insert(F.getIterator(), NF);
113-
NF->takeName(&F);
114-
NF->splice(NF->begin(), &F);
115-
116-
Function::arg_iterator NFArg = NF->arg_begin();
117-
for (Argument &Arg : F.args()) {
118-
Arg.replaceAllUsesWith(&*NFArg);
119-
NFArg->takeName(&Arg);
120-
++NFArg;
121-
}
122-
123-
AttrBuilder AB(Ctx);
124-
AB.addAttribute(Attribute::InReg);
125-
AB.addAttribute("amdgpu-hidden-argument");
126-
AttributeList AL = NF->getAttributes();
127-
for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
128-
AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
129-
NFArg++->setName(getHiddenArgName(HiddenArg(I)));
130-
}
131-
132-
NF->setAttributes(AL);
133-
F.replaceAllUsesWith(NF);
134-
F.setCallingConv(CallingConv::C);
135-
F.clearMetadata();
136-
137-
return NF;
138-
}
139-
140-
public:
141-
PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
142-
setInitialFreeUserSGPRsCount();
143-
}
144-
145-
// Returns the maximum number of user SGPRs that we have available to preload
146-
// arguments.
147-
void setInitialFreeUserSGPRsCount() {
148-
GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
149-
NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
150-
}
151-
152-
bool tryAllocPreloadSGPRs(unsigned AllocSize, uint64_t ArgOffset,
153-
uint64_t LastExplicitArgOffset) {
154-
// Check if this argument may be loaded into the same register as the
155-
// previous argument.
156-
if (ArgOffset - LastExplicitArgOffset < 4 &&
157-
!isAligned(Align(4), ArgOffset))
158-
return true;
159-
160-
// Pad SGPRs for kernarg alignment.
161-
ArgOffset = alignDown(ArgOffset, 4);
162-
unsigned Padding = ArgOffset - LastExplicitArgOffset;
163-
unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
164-
unsigned NumPreloadSGPRs = alignTo(AllocSize, 4) / 4;
165-
if (NumPreloadSGPRs + PaddingSGPRs > NumFreeUserSGPRs)
166-
return false;
167-
168-
NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
169-
return true;
170-
}
171-
172-
// Try to allocate SGPRs to preload implicit kernel arguments.
173-
void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
174-
uint64_t LastExplicitArgOffset,
175-
IRBuilder<> &Builder) {
176-
Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
177-
F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
178-
if (!ImplicitArgPtr)
179-
return;
180-
181-
const DataLayout &DL = F.getParent()->getDataLayout();
182-
// Pair is the load and the load offset.
183-
SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
184-
for (auto *U : ImplicitArgPtr->users()) {
185-
Instruction *CI = dyn_cast<Instruction>(U);
186-
if (!CI || CI->getParent()->getParent() != &F)
187-
continue;
188-
189-
for (auto *U : CI->users()) {
190-
int64_t Offset = 0;
191-
auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
192-
if (!Load) {
193-
if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
194-
continue;
195-
196-
Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
197-
}
198-
199-
if (!Load || !Load->isSimple())
200-
continue;
201-
202-
// FIXME: Expand to handle 64-bit implicit args and large merged loads.
203-
LLVMContext &Ctx = F.getParent()->getContext();
204-
Type *LoadTy = Load->getType();
205-
HiddenArg HA = getHiddenArgFromOffset(Offset);
206-
if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
207-
continue;
208-
209-
ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
210-
}
211-
}
212-
213-
if (ImplicitArgLoads.empty())
214-
return;
215-
216-
// Allocate loads in order of offset. We need to be sure that the implicit
217-
// argument can actually be preloaded.
218-
std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());
219-
220-
// If we fail to preload any implicit argument we know we don't have SGPRs
221-
// to preload any subsequent ones with larger offsets. Find the first
222-
// argument that we cannot preload.
223-
auto *PreloadEnd = std::find_if(
224-
ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
225-
[&](const std::pair<LoadInst *, unsigned> &Load) {
226-
unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
227-
unsigned LoadOffset = Load.second;
228-
if (!tryAllocPreloadSGPRs(LoadSize,
229-
LoadOffset + ImplicitArgsBaseOffset,
230-
LastExplicitArgOffset))
231-
return true;
232-
233-
LastExplicitArgOffset =
234-
ImplicitArgsBaseOffset + LoadOffset + LoadSize;
235-
return false;
236-
});
237-
238-
if (PreloadEnd == ImplicitArgLoads.begin())
239-
return;
240-
241-
unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
242-
Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
243-
assert(NF);
244-
for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
245-
LoadInst *LoadInst = I->first;
246-
unsigned LoadOffset = I->second;
247-
unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
248-
unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
249-
Argument *Arg = NF->getArg(Index);
250-
LoadInst->replaceAllUsesWith(Arg);
251-
}
252-
}
253-
};
254-
25530
class AMDGPULowerKernelArguments : public FunctionPass {
25631
public:
25732
static char ID;
@@ -311,10 +86,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
31186
Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
31287

31388
uint64_t ExplicitArgOffset = 0;
314-
// Preloaded kernel arguments must be sequential.
315-
bool InPreloadSequence = true;
316-
PreloadKernelArgInfo PreloadInfo(F, ST);
317-
31889
for (Argument &Arg : F.args()) {
31990
const bool IsByRef = Arg.hasByRefAttr();
32091
Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
@@ -325,25 +96,10 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
32596
uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
32697

32798
uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
328-
uint64_t LastExplicitArgOffset = ExplicitArgOffset;
32999
ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
330100

331-
// Guard against the situation where hidden arguments have already been
332-
// lowered and added to the kernel function signiture, i.e. in a situation
333-
// where this pass has run twice.
334-
if (Arg.hasAttribute("amdgpu-hidden-argument"))
335-
break;
336-
337-
// Try to preload this argument into user SGPRs.
338-
if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() &&
339-
!Arg.getType()->isAggregateType())
340-
if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset,
341-
LastExplicitArgOffset))
342-
continue;
343-
344-
InPreloadSequence = false;
345-
346-
if (Arg.use_empty())
101+
// Skip inreg arguments which should be preloaded.
102+
if (Arg.use_empty() || Arg.hasInRegAttr())
347103
continue;
348104

349105
// If this is byval, the loads are already explicit in the function. We just
@@ -483,14 +239,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
483239
KernArgSegment->addRetAttr(
484240
Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
485241

486-
if (InPreloadSequence) {
487-
uint64_t ImplicitArgsBaseOffset =
488-
alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
489-
BaseOffset;
490-
PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
491-
ExplicitArgOffset, Builder);
492-
}
493-
494242
return true;
495243
}
496244

llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
2929
MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
3030
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
3131
MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
32+
MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(*this))
3233
#undef MODULE_PASS
3334

3435
#ifndef MODULE_PASS_WITH_PARAMS

0 commit comments

Comments
 (0)