-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[AMDGPU] Support preloading hidden kernel arguments #98861
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,8 @@ | |
|
||
#include "AMDGPU.h" | ||
#include "GCNSubtarget.h" | ||
#include "llvm/ADT/StringExtras.h" | ||
#include "llvm/Analysis/ValueTracking.h" | ||
#include "llvm/CodeGen/TargetPassConfig.h" | ||
#include "llvm/IR/IRBuilder.h" | ||
#include "llvm/IR/IntrinsicsAMDGPU.h" | ||
|
@@ -31,9 +33,110 @@ class PreloadKernelArgInfo { | |
const GCNSubtarget &ST; | ||
unsigned NumFreeUserSGPRs; | ||
|
||
public: | ||
SmallVector<llvm::Metadata *, 8> KernelArgMetadata; | ||
// Identifies each hidden (implicit) kernel argument that can be preloaded.
// The enumerators double as indices into the HiddenArgs table, so their
// order must match the order of the entries in that table.
enum HiddenArg : unsigned {
  HIDDEN_BLOCK_COUNT_X = 0,
  HIDDEN_BLOCK_COUNT_Y = 1,
  HIDDEN_BLOCK_COUNT_Z = 2,
  HIDDEN_GROUP_SIZE_X = 3,
  HIDDEN_GROUP_SIZE_Y = 4,
  HIDDEN_GROUP_SIZE_Z = 5,
  HIDDEN_REMAINDER_X = 6,
  HIDDEN_REMAINDER_Y = 7,
  HIDDEN_REMAINDER_Z = 8,
  // Sentinel: number of hidden arguments; also used as the "not found" value.
  END_HIDDEN_ARGS = 9
};
|
||
// Describes a single hidden argument: where it lives, how big it is, and
// what it is called.
struct HiddenArgInfo {
  // Byte offset of the argument, measured from the location in the kernarg
  // segment that the implicitarg pointer points to.
  uint8_t Offset;
  // Size of the argument in bytes.
  uint8_t Size;
  // Name given to the argument in the kernel signature.
  const char *Name;
};
|
||
// Table of hidden arguments, indexed by HiddenArg. Each entry is
// {byte offset from the implicitarg pointer, size in bytes, name}.
static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
    {0, 4, "_hidden_block_count_x"},  // HIDDEN_BLOCK_COUNT_X
    {4, 4, "_hidden_block_count_y"},  // HIDDEN_BLOCK_COUNT_Y
    {8, 4, "_hidden_block_count_z"},  // HIDDEN_BLOCK_COUNT_Z
    {12, 2, "_hidden_group_size_x"},  // HIDDEN_GROUP_SIZE_X
    {14, 2, "_hidden_group_size_y"},  // HIDDEN_GROUP_SIZE_Y
    {16, 2, "_hidden_group_size_z"},  // HIDDEN_GROUP_SIZE_Z
    {18, 2, "_hidden_remainder_x"},   // HIDDEN_REMAINDER_X
    {20, 2, "_hidden_remainder_y"},   // HIDDEN_REMAINDER_Y
    {22, 2, "_hidden_remainder_z"}};  // HIDDEN_REMAINDER_Z
|
||
// Maps a byte offset (relative to the implicitarg pointer) to the hidden
// argument that starts at exactly that offset. Returns END_HIDDEN_ARGS when
// no table entry begins there.
static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
  unsigned Index = 0;
  // Stop at the first entry whose offset matches, or at the sentinel.
  while (Index != END_HIDDEN_ARGS && HiddenArgs[Index].Offset != Offset)
    ++Index;

  return static_cast<HiddenArg>(Index);
}
|
||
// Returns the integer IR type whose bit width matches the byte size recorded
// for hidden argument HA. HA must be a real hidden argument; passing
// END_HIDDEN_ARGS (or anything larger) is a programming error.
static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
  if (HA >= END_HIDDEN_ARGS)
    llvm_unreachable("Unexpected hidden argument.");

  const unsigned NumBits = HiddenArgs[HA].Size * 8;
  return Type::getIntNTy(Ctx, NumBits);
}
|
||
// Returns the kernel-signature name recorded for hidden argument HA. HA must
// be a real hidden argument; passing END_HIDDEN_ARGS (or anything larger) is
// a programming error.
static const char *getHiddenArgName(HiddenArg HA) {
  if (HA >= END_HIDDEN_ARGS)
    llvm_unreachable("Unexpected hidden argument.");

  return HiddenArgs[HA].Name;
}
|
||
// Clones the function after adding implicit arguments to the argument list | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this need to defend against arguments already marked as hidden arguments, in case the pass runs twice? i.e. running this pass a second time should be a no-op |
||
// and returns the new updated function. Preloaded implicit arguments are | ||
// added up to and including the last one that will be preloaded, indicated by | ||
// LastPreloadIndex. Currently preloading is only performed on the totality of | ||
// sequential data from the kernarg segment including implicit (hidden) | ||
// arguments. This means that all arguments up to the last preloaded argument | ||
// will also be preloaded even if that data is unused. | ||
Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) { | ||
FunctionType *FT = F.getFunctionType(); | ||
LLVMContext &Ctx = F.getParent()->getContext(); | ||
SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end()); | ||
for (unsigned I = 0; I <= LastPreloadIndex; ++I) | ||
FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I))); | ||
|
||
FunctionType *NFT = | ||
FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg()); | ||
Function *NF = | ||
Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName()); | ||
|
||
NF->copyAttributesFrom(&F); | ||
NF->copyMetadata(&F, 0); | ||
NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should probably be in copyAttributesFrom |
||
|
||
F.getParent()->getFunctionList().insert(F.getIterator(), NF); | ||
NF->takeName(&F); | ||
NF->splice(NF->begin(), &F); | ||
|
||
Function::arg_iterator NFArg = NF->arg_begin(); | ||
for (Argument &Arg : F.args()) { | ||
Arg.replaceAllUsesWith(&*NFArg); | ||
NFArg->takeName(&Arg); | ||
kerbowa marked this conversation as resolved.
Show resolved
Hide resolved
|
||
++NFArg; | ||
} | ||
|
||
AttrBuilder AB(Ctx); | ||
AB.addAttribute(Attribute::InReg); | ||
AB.addAttribute("amdgpu-hidden-argument"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you document this attribute in AMDGPUUsage? |
||
AttributeList AL = NF->getAttributes(); | ||
for (unsigned I = 0; I <= LastPreloadIndex; ++I) { | ||
AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB); | ||
NFArg++->setName(getHiddenArgName(HiddenArg(I))); | ||
kerbowa marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
NF->setAttributes(AL); | ||
F.replaceAllUsesWith(NF); | ||
F.setCallingConv(CallingConv::C); | ||
|
||
return NF; | ||
} | ||
|
||
public: | ||
// Binds this helper to the function F and the subtarget ST, then calls
// setInitialFreeUserSGPRsCount() to set up the SGPR bookkeeping.
PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST)
    : F(F),
      ST(ST) {
  this->setInitialFreeUserSGPRsCount();
}
|
@@ -64,6 +167,87 @@ class PreloadKernelArgInfo { | |
NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs); | ||
return true; | ||
} | ||
|
||
// Try to allocate SGPRs to preload implicit kernel arguments. | ||
void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset, | ||
IRBuilder<> &Builder) { | ||
StringRef Name = Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr); | ||
Function *ImplicitArgPtr = F.getParent()->getFunction(Name); | ||
if (!ImplicitArgPtr) | ||
return; | ||
Comment on lines
+174
to
+177
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We really ought to have a version of getDeclaration that won't insert the declaration if it's not already there |
||
|
||
const DataLayout &DL = F.getParent()->getDataLayout(); | ||
// Pair is the load and the load offset. | ||
SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads; | ||
for (auto *U : ImplicitArgPtr->users()) { | ||
Instruction *CI = dyn_cast<Instruction>(U); | ||
if (!CI || CI->getParent()->getParent() != &F) | ||
continue; | ||
|
||
for (auto *U : CI->users()) { | ||
int64_t Offset = 0; | ||
auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr? | ||
if (!Load) { | ||
if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI) | ||
continue; | ||
|
||
Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP? | ||
} | ||
|
||
if (!Load || !Load->isSimple()) | ||
continue; | ||
Comment on lines
+197
to
+198
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can't think of a reason we wouldn't do this for volatile or atomic loads, but I guess can remove this later |
||
|
||
// FIXME: Expand to handle 64-bit implicit args and large merged loads. | ||
LLVMContext &Ctx = F.getParent()->getContext(); | ||
Type *LoadTy = Load->getType(); | ||
HiddenArg HA = getHiddenArgFromOffset(Offset); | ||
if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA)) | ||
continue; | ||
|
||
ImplicitArgLoads.push_back(std::make_pair(Load, Offset)); | ||
} | ||
} | ||
|
||
if (ImplicitArgLoads.empty()) | ||
return; | ||
|
||
// Allocate loads in order of offset. We need to be sure that the implicit | ||
// argument can actually be preloaded. | ||
std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second()); | ||
|
||
uint64_t LastExplicitArgOffset = ImplicitArgsBaseOffset; | ||
// If we fail to preload any implicit argument we know we don't have SGPRs | ||
// to preload any subsequent ones with larger offsets. Find the first | ||
// argument that we cannot preload. | ||
auto *PreloadEnd = std::find_if( | ||
ImplicitArgLoads.begin(), ImplicitArgLoads.end(), | ||
[&](const std::pair<LoadInst *, unsigned> &Load) { | ||
unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType()); | ||
unsigned LoadOffset = Load.second; | ||
if (!tryAllocPreloadSGPRs(LoadSize, | ||
LoadOffset + ImplicitArgsBaseOffset, | ||
LastExplicitArgOffset)) | ||
return true; | ||
|
||
LastExplicitArgOffset = LoadOffset + LoadSize; | ||
return false; | ||
}); | ||
|
||
if (PreloadEnd == ImplicitArgLoads.begin()) | ||
return; | ||
|
||
unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second); | ||
Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex); | ||
assert(NF); | ||
for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) { | ||
LoadInst *LoadInst = I->first; | ||
unsigned LoadOffset = I->second; | ||
unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset); | ||
unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1; | ||
Argument *Arg = NF->getArg(Index); | ||
LoadInst->replaceAllUsesWith(Arg); | ||
} | ||
} | ||
}; | ||
|
||
class AMDGPULowerKernelArguments : public FunctionPass { | ||
|
@@ -142,6 +326,12 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { | |
uint64_t LastExplicitArgOffset = ExplicitArgOffset; | ||
ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize; | ||
|
||
// Guard against the situation where hidden arguments have already been | ||
// lowered and added to the kernel function signature, i.e. in a situation | ||
// where this pass has run twice. | ||
if (Arg.hasAttribute("amdgpu-hidden-argument")) | ||
break; | ||
|
||
// Try to preload this argument into user SGPRs. | ||
if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() && | ||
!Arg.getType()->isAggregateType()) | ||
|
@@ -281,6 +471,14 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { | |
KernArgSegment->addRetAttr( | ||
Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign))); | ||
|
||
if (InPreloadSequence) { | ||
uint64_t ImplicitArgsBaseOffset = | ||
alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) + | ||
BaseOffset; | ||
PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset, | ||
Builder); | ||
arsenm marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
return true; | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also inreg