Commit 1af8cfb

[AMDGPU] Move kernarg preload logic to AMDGPU Attributor
Besides the changes listed below, everything works the same as it did when this code lived in AMDGPULowerKernelArguments:

- The free user SGPR tracking is refactored to be simpler and more accurate. We don't actually care which SGPRs hold which arguments before ISel, so specific tracking of the number of free registers is removed. In one case this leads to one extra argument being preloaded in a test; ISel correctly identifies the opportunity even though the IR pass previously missed it.
- Even though inreg is meant to act as a hint, the coupling between the attribute and whether an argument is actually preloaded should now be exact, although ISel always makes the final determination.
- Since AMDGPULowerKernelArguments no longer handles preloading, that pass must rely on the inreg attribute to decide whether to leave arguments as-is. This leads to some test changes.
- This lowering is moved out of the llc pipeline, which requires test updates.
- Cloned function declarations are removed when kernel signatures are modified to preload hidden arguments.
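As a rough illustration of the budget rule the new code relies on throughout: each user SGPR holds 4 bytes of kernarg data, so an argument can be preloaded only if every byte up to and including its own fits in the free-SGPR budget. Below is a minimal standalone sketch mirroring PreloadKernelArgInfo::canPreloadKernArgAtOffset from the diff; the helper name fitsInFreeUserSGPRs and the 14-register budget are made-up values for illustration, not queried from a real subtarget.

#include <cassert>
#include <cstdint>

// Standalone mirror of the check in canPreloadKernArgAtOffset: kernarg bytes
// [0, EndOffset) are preloadable iff they fit in the free user SGPRs, each of
// which covers 4 bytes.
static bool fitsInFreeUserSGPRs(uint64_t EndOffset, unsigned NumFreeUserSGPRs) {
  return EndOffset <= uint64_t(NumFreeUserSGPRs) * 4;
}

int main() {
  const unsigned NumFreeUserSGPRs = 14; // illustrative budget: 56 bytes
  // A segment ending at byte 56 exactly fills the budget...
  assert(fitsInFreeUserSGPRs(56, NumFreeUserSGPRs));
  // ...but an argument ending at byte 60 does not fit, so neither it nor any
  // later argument would receive the inreg attribute.
  assert(!fitsInFreeUserSGPRs(60, NumFreeUserSGPRs));
  return 0;
}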
1 parent: 6d27ee0

10 files changed: +456, -1021 lines

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Lines changed: 270 additions & 13 deletions
@@ -14,7 +14,9 @@
 #include "GCNSubtarget.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/Analysis/CycleAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/Target/TargetMachine.h"
@@ -144,6 +146,213 @@ static bool funcRequiresHostcallPtr(const Function &F) {
 }
 
 namespace {
+
+class PreloadKernelArgInfo {
+private:
+  Function &F;
+  const GCNSubtarget &ST;
+  unsigned NumFreeUserSGPRs;
+
+  enum HiddenArg : unsigned {
+    HIDDEN_BLOCK_COUNT_X,
+    HIDDEN_BLOCK_COUNT_Y,
+    HIDDEN_BLOCK_COUNT_Z,
+    HIDDEN_GROUP_SIZE_X,
+    HIDDEN_GROUP_SIZE_Y,
+    HIDDEN_GROUP_SIZE_Z,
+    HIDDEN_REMAINDER_X,
+    HIDDEN_REMAINDER_Y,
+    HIDDEN_REMAINDER_Z,
+    END_HIDDEN_ARGS
+  };
+
+  // Stores information about a specific hidden argument.
+  struct HiddenArgInfo {
+    // Offset in bytes from the location in the kernarg segment pointed to by
+    // the implicitarg pointer.
+    uint8_t Offset;
+    // The size of the hidden argument in bytes.
+    uint8_t Size;
+    // The name of the hidden argument in the kernel signature.
+    const char *Name;
+  };
+
+  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
+      {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
+      {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
+      {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
+      {18, 2, "_hidden_remainder_x"},  {20, 2, "_hidden_remainder_y"},
+      {22, 2, "_hidden_remainder_z"}};
+
+  static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
+    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
+      if (HiddenArgs[I].Offset == Offset)
+        return static_cast<HiddenArg>(I);
+
+    return END_HIDDEN_ARGS;
+  }
+
+  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
+    if (HA < END_HIDDEN_ARGS)
+      return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);
+
+    llvm_unreachable("Unexpected hidden argument.");
+  }
+
+  static const char *getHiddenArgName(HiddenArg HA) {
+    if (HA < END_HIDDEN_ARGS) {
+      return HiddenArgs[HA].Name;
+    }
+    llvm_unreachable("Unexpected hidden argument.");
+  }
+
+  // Clones the function after adding implicit arguments to the argument list
+  // and returns the new updated function. Preloaded implicit arguments are
+  // added up to and including the last one that will be preloaded, indicated
+  // by LastPreloadIndex. Currently preloading is only performed on the
+  // totality of sequential data from the kernarg segment including implicit
+  // (hidden) arguments. This means that all arguments up to the last
+  // preloaded argument will also be preloaded even if that data is unused.
+  Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
+    FunctionType *FT = F.getFunctionType();
+    LLVMContext &Ctx = F.getParent()->getContext();
+    SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
+    for (unsigned I = 0; I <= LastPreloadIndex; ++I)
+      FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));
+
+    FunctionType *NFT =
+        FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
+    Function *NF =
+        Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());
+
+    NF->copyAttributesFrom(&F);
+    NF->copyMetadata(&F, 0);
+    NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);
+
+    F.getParent()->getFunctionList().insert(F.getIterator(), NF);
+    NF->takeName(&F);
+    NF->splice(NF->begin(), &F);
+
+    Function::arg_iterator NFArg = NF->arg_begin();
+    for (Argument &Arg : F.args()) {
+      Arg.replaceAllUsesWith(&*NFArg);
+      NFArg->takeName(&Arg);
+      ++NFArg;
+    }
+
+    AttrBuilder AB(Ctx);
+    AB.addAttribute(Attribute::InReg);
+    AB.addAttribute("amdgpu-hidden-argument");
+    AttributeList AL = NF->getAttributes();
+    for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
+      AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
+      NFArg++->setName(getHiddenArgName(HiddenArg(I)));
+    }
+
+    NF->setAttributes(AL);
+    F.replaceAllUsesWith(NF);
+
+    return NF;
+  }
+
+public:
+  PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
+    setInitialFreeUserSGPRsCount();
+  }
+
+  // Sets the maximum number of user SGPRs that we have available to preload
+  // arguments.
+  void setInitialFreeUserSGPRsCount() {
+    GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
+    NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
+  }
+
+  bool canPreloadKernArgAtOffset(uint64_t ExplicitArgOffset) {
+    return ExplicitArgOffset <= NumFreeUserSGPRs * 4;
+  }
+
+  // Try to allocate SGPRs to preload hidden kernel arguments.
+  void
+  tryAllocHiddenArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
+                                SmallVectorImpl<Function *> &FunctionsToErase) {
+    Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
+        F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
+    if (!ImplicitArgPtr)
+      return;
+
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    // Pair is the load and the load offset.
+    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
+    for (auto *U : ImplicitArgPtr->users()) {
+      Instruction *CI = dyn_cast<Instruction>(U);
+      if (!CI || CI->getParent()->getParent() != &F)
+        continue;
+
+      for (auto *U : CI->users()) {
+        int64_t Offset = 0;
+        auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
+        if (!Load) {
+          if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
+            continue;
+
+          Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
+        }
+
+        if (!Load || !Load->isSimple())
+          continue;
+
+        // FIXME: Expand to handle merged loads.
+        LLVMContext &Ctx = F.getParent()->getContext();
+        Type *LoadTy = Load->getType();
+        HiddenArg HA = getHiddenArgFromOffset(Offset);
+        if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
+          continue;
+
+        ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
+      }
+    }
+
+    if (ImplicitArgLoads.empty())
+      return;
+
+    // Allocate loads in order of offset. We need to be sure that the implicit
+    // argument can actually be preloaded.
+    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());
+
+    // If we fail to preload any implicit argument we know we don't have SGPRs
+    // to preload any subsequent ones with larger offsets. Find the first
+    // argument that we cannot preload.
+    auto *PreloadEnd =
+        std::find_if(ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
+                     [&](const std::pair<LoadInst *, unsigned> &Load) {
+                       unsigned LoadSize =
+                           DL.getTypeStoreSize(Load.first->getType());
+                       unsigned LoadOffset = Load.second;
+                       if (!canPreloadKernArgAtOffset(LoadOffset + LoadSize +
+                                                      ImplicitArgsBaseOffset))
+                         return true;
+
+                       return false;
+                     });
+
+    if (PreloadEnd == ImplicitArgLoads.begin())
+      return;
+
+    unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
+    Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
+    assert(NF);
+    FunctionsToErase.push_back(&F);
+    for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
+      LoadInst *LoadInst = I->first;
+      unsigned LoadOffset = I->second;
+      unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
+      unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
+      Argument *Arg = NF->getArg(Index);
+      LoadInst->replaceAllUsesWith(Arg);
+    }
+  }
+};
+
 class AMDGPUInformationCache : public InformationCache {
 public:
   AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
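To make the HiddenArgs table and the argument-index arithmetic in tryAllocHiddenArgPreloadSGPRs concrete, here is a small standalone sketch. It copies the table's offsets and replays the Index computation with made-up argument counts; the names HA and Table are hypothetical stand-ins, not the pass's actual types.

#include <cassert>

// Mirror of the HiddenArgs offsets above: block counts are i32 at bytes
// 0/4/8, group sizes are i16 at 12/14/16, remainders are i16 at 18/20/22.
struct HA { unsigned Offset; unsigned Size; const char *Name; };
static const HA Table[] = {
    {0, 4, "_hidden_block_count_x"},  {4, 4, "_hidden_block_count_y"},
    {8, 4, "_hidden_block_count_z"},  {12, 2, "_hidden_group_size_x"},
    {14, 2, "_hidden_group_size_y"},  {16, 2, "_hidden_group_size_z"},
    {18, 2, "_hidden_remainder_x"},   {20, 2, "_hidden_remainder_y"},
    {22, 2, "_hidden_remainder_z"}};

int main() {
  // A load at implicitarg offset 12 resolves to _hidden_group_size_x, an i16.
  assert(Table[3].Offset == 12 && Table[3].Size == 2);

  // Index math from tryAllocHiddenArgPreloadSGPRs: a kernel with 2 explicit
  // arguments that preloads hidden args up to HIDDEN_BLOCK_COUNT_Z
  // (LastHiddenArgIndex = 2) has 2 + 3 = 5 parameters after cloning.
  unsigned NumExplicit = 2, LastHiddenArgIndex = 2;
  unsigned ArgSize = NumExplicit + LastHiddenArgIndex + 1;
  // The load of _hidden_block_count_y (HiddenArgIndex = 1) maps to the
  // cloned function's parameter 3, i.e. the second appended hidden arg.
  unsigned HiddenArgIndex = 1;
  unsigned Index = ArgSize - LastHiddenArgIndex + HiddenArgIndex - 1;
  assert(Index == 3);
  return 0;
}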
@@ -1314,19 +1523,64 @@ struct AAAMDGPUNoAGPR
 
 const char AAAMDGPUNoAGPR::ID = 0;
 
-static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
-  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
-  for (unsigned I = 0;
-       I < F.arg_size() &&
-       I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs());
-       ++I) {
-    Argument &Arg = *F.getArg(I);
-    // Check for incompatible attributes.
-    if (Arg.hasByRefAttr() || Arg.hasNestAttr())
-      break;
+static void markKernelArgsAsInreg(SetVector<Function *> &Functions,
+                                  TargetMachine &TM) {
+  SmallVector<Function *, 4> FunctionsToErase;
+  for (auto *F : Functions) {
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
+    if (!ST.hasKernargPreload() ||
+        F->getCallingConv() != CallingConv::AMDGPU_KERNEL || F->arg_empty())
+      continue;
+
+    PreloadKernelArgInfo PreloadInfo(*F, ST);
+    uint64_t ExplicitArgOffset = 0;
+    const DataLayout &DL = F->getDataLayout();
+    const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();
+    unsigned NumPreloadsRequested = KernargPreloadCount;
+    unsigned NumPreloadedExplicitArgs = 0;
+    for (Argument &Arg : F->args()) {
+      // Avoid incompatible attributes and guard against running this pass
+      // twice.
+      if (Arg.hasByRefAttr() || Arg.hasNestAttr() ||
+          Arg.hasAttribute("amdgpu-hidden-argument"))
+        break;
+
+      // Inreg may be pre-existing on some arguments; try to preload these.
+      if (NumPreloadsRequested == 0 && !Arg.hasInRegAttr())
+        break;
+
+      // FIXME: Preload aggregates.
+      if (Arg.getType()->isAggregateType())
+        break;
+
+      Type *ArgTy = Arg.getType();
+      Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
+      uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
+      ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
+      if (!PreloadInfo.canPreloadKernArgAtOffset(ExplicitArgOffset))
+        break;
+
+      Arg.addAttr(Attribute::InReg);
+      NumPreloadedExplicitArgs++;
+      if (NumPreloadsRequested > 0)
+        NumPreloadsRequested--;
+    }
 
-    Arg.addAttr(Attribute::InReg);
+    // Only try preloading hidden arguments if we can successfully preload the
+    // last explicit argument.
+    if (NumPreloadedExplicitArgs == F->arg_size()) {
+      uint64_t ImplicitArgsBaseOffset =
+          alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
+          BaseOffset;
+      PreloadInfo.tryAllocHiddenArgPreloadSGPRs(ImplicitArgsBaseOffset,
+                                                FunctionsToErase);
+    }
   }
+
+  // Erase cloned functions if we needed to update the kernel signature to
+  // support preloading hidden kernel arguments.
+  for (auto *F : FunctionsToErase)
+    F->eraseFromParent();
 }
 
 static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
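The running ExplicitArgOffset in markKernelArgsAsInreg is the aligned end offset of each argument, so canPreloadKernArgAtOffset is checked against the first byte past the argument. Below is a minimal sketch of that accumulation, with a hand-written alignTo and made-up sizes and alignments standing in for a real DataLayout.

#include <cassert>
#include <cstdint>

// Same rounding LLVM's alignTo performs: round Offset up to a multiple of
// Align.
static uint64_t alignTo(uint64_t Offset, uint64_t Align) {
  return (Offset + Align - 1) / Align * Align;
}

int main() {
  // Illustrative kernel signature: (i32, i64, i16) with natural alignment.
  struct { uint64_t Size, Align; } Args[] = {{4, 4}, {8, 8}, {2, 2}};
  uint64_t ExplicitArgOffset = 0;
  uint64_t Ends[3];
  for (int I = 0; I < 3; ++I) {
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Args[I].Align) + Args[I].Size;
    Ends[I] = ExplicitArgOffset;
  }
  // i32 occupies [0,4); i64 is padded to start at 8 and ends at 16; the
  // trailing i16 occupies [16,18). Each end offset feeds
  // canPreloadKernArgAtOffset before the argument is marked inreg.
  assert(Ends[0] == 4 && Ends[1] == 16 && Ends[2] == 18);
  return 0;
}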
@@ -1378,8 +1632,6 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
     if (!AMDGPU::isEntryFunctionCC(CC)) {
       A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
       A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(*F));
-    } else if (CC == CallingConv::AMDGPU_KERNEL) {
-      addPreloadKernArgHint(*F, TM);
     }
 
     for (auto &I : instructions(F)) {
@@ -1400,6 +1652,11 @@
   }
 
   ChangeStatus Change = A.run();
+
+  // Mark kernel arguments with 'inreg' attribute to indicate that they should
+  // be preloaded into SGPRs.
+  markKernelArgsAsInreg(Functions, TM);
+
   return Change == ChangeStatus::CHANGED;
 }
 