[AMDGPU] Move kernarg preload logic to AMDGPU Attributor #123547

Closed
285 changes: 272 additions & 13 deletions llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -14,7 +14,9 @@
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CycleAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
@@ -144,6 +146,213 @@ static bool funcRequiresHostcallPtr(const Function &F) {
}

namespace {

class PreloadKernelArgInfo {
private:
  Function &F;
  const GCNSubtarget &ST;
  unsigned NumFreeUserSGPRs;

  enum HiddenArg : unsigned {
    HIDDEN_BLOCK_COUNT_X,
    HIDDEN_BLOCK_COUNT_Y,
    HIDDEN_BLOCK_COUNT_Z,
    HIDDEN_GROUP_SIZE_X,
    HIDDEN_GROUP_SIZE_Y,
    HIDDEN_GROUP_SIZE_Z,
    HIDDEN_REMAINDER_X,
    HIDDEN_REMAINDER_Y,
    HIDDEN_REMAINDER_Z,
    END_HIDDEN_ARGS
  };

  // Stores information about a specific hidden argument.
  struct HiddenArgInfo {
    // Offset in bytes from the location in the kernarg segment pointed to by
    // the implicitarg pointer.
    uint8_t Offset;
    // The size of the hidden argument in bytes.
    uint8_t Size;
    // The name of the hidden argument in the kernel signature.
    const char *Name;
  };

  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
      {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
      {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
      {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
      {18, 2, "_hidden_remainder_x"},  {20, 2, "_hidden_remainder_y"},
      {22, 2, "_hidden_remainder_z"}};

  static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
      if (HiddenArgs[I].Offset == Offset)
        return static_cast<HiddenArg>(I);

    return END_HIDDEN_ARGS;
  }

  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS)
      return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);

    llvm_unreachable("Unexpected hidden argument.");
  }

  static const char *getHiddenArgName(HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS) {
      return HiddenArgs[HA].Name;
    }
    llvm_unreachable("Unexpected hidden argument.");
  }
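
  // For example, per the HiddenArgs table above: getHiddenArgFromOffset(12)
  // is HIDDEN_GROUP_SIZE_X, getHiddenArgType(Ctx, HIDDEN_GROUP_SIZE_X) is i16
  // (2 bytes * 8 bits), and getHiddenArgName(HIDDEN_GROUP_SIZE_X) is
  // "_hidden_group_size_x".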

  // Clones the function after adding implicit arguments to the argument list
  // and returns the new, updated function. Preloaded implicit arguments are
  // added up to and including the last one that will be preloaded, indicated
  // by LastPreloadIndex. Currently preloading only operates on a contiguous
  // run of data from the start of the kernarg segment, including implicit
  // (hidden) arguments, so every argument up to and including the last
  // preloaded one is preloaded even if its data is unused.
  Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
    FunctionType *FT = F.getFunctionType();
    LLVMContext &Ctx = F.getParent()->getContext();
    SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
    for (unsigned I = 0; I <= LastPreloadIndex; ++I)
      FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));

    FunctionType *NFT =
        FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
    Function *NF =
        Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());

    NF->copyAttributesFrom(&F);
    NF->copyMetadata(&F, 0);
    NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);

    F.getParent()->getFunctionList().insert(F.getIterator(), NF);
    NF->takeName(&F);
    NF->splice(NF->begin(), &F);

    Function::arg_iterator NFArg = NF->arg_begin();
    for (Argument &Arg : F.args()) {
      Arg.replaceAllUsesWith(&*NFArg);
      NFArg->takeName(&Arg);
      ++NFArg;
    }

    AttrBuilder AB(Ctx);
    AB.addAttribute(Attribute::InReg);
    AB.addAttribute("amdgpu-hidden-argument");
    AttributeList AL = NF->getAttributes();
    for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
      AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
      NFArg++->setName(getHiddenArgName(HiddenArg(I)));
    }

    NF->setAttributes(AL);
    F.replaceAllUsesWith(NF);

    return NF;
  }
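
  // Illustrative example (assumed, not from the patch): for a kernel
  //   define amdgpu_kernel void @k(ptr %out)
  // a call with LastPreloadIndex == HIDDEN_GROUP_SIZE_X appends four hidden
  // parameters, each marked inreg plus "amdgpu-hidden-argument", giving
  //   define amdgpu_kernel void @k(ptr %out,
  //       i32 inreg %_hidden_block_count_x, i32 inreg %_hidden_block_count_y,
  //       i32 inreg %_hidden_block_count_z, i16 inreg %_hidden_group_size_x)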

public:
  PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
    setInitialFreeUserSGPRsCount();
  }

  // Set the number of user SGPRs that we have available to preload
  // arguments.
  void setInitialFreeUserSGPRsCount() {
    GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
    NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
  }

  bool canPreloadKernArgAtOffset(uint64_t ExplicitArgOffset) {
    return ExplicitArgOffset <= NumFreeUserSGPRs * 4;
  }
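
  // Note on the check above: each user SGPR holds 4 bytes of kernarg data, so
  // the check succeeds only for arguments that end within the first
  // NumFreeUserSGPRs * 4 bytes of the kernarg segment. E.g., with 14 free
  // user SGPRs, an argument ending at byte offset 56 can be preloaded, while
  // one ending at byte 60 cannot.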

  // Try to allocate SGPRs to preload hidden kernel arguments.
  void
  tryAllocHiddenArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
                                SmallVectorImpl<Function *> &FunctionsToErase) {
    Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
        F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
    if (!ImplicitArgPtr)
      return;

    const DataLayout &DL = F.getParent()->getDataLayout();
    // Pair is the load and the load offset.
    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
    for (auto *U : ImplicitArgPtr->users()) {
      Instruction *CI = dyn_cast<Instruction>(U);
      if (!CI || CI->getParent()->getParent() != &F)
        continue;

      for (auto *U : CI->users()) {
        int64_t Offset = 0;
        auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
        if (!Load) {
          if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
            continue;

          Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
        }

        if (!Load || !Load->isSimple())
          continue;

        // FIXME: Expand to handle merged loads.
        LLVMContext &Ctx = F.getParent()->getContext();
        Type *LoadTy = Load->getType();
        HiddenArg HA = getHiddenArgFromOffset(Offset);
        if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
          continue;

        ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
      }
    }

    if (ImplicitArgLoads.empty())
      return;

    // Allocate loads in order of offset. We need to be sure that the implicit
    // argument can actually be preloaded.
    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());

    // If we fail to preload any implicit argument we know we don't have SGPRs
    // to preload any subsequent ones with larger offsets. Find the first
    // argument that we cannot preload.
    auto *PreloadEnd =
        std::find_if(ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
                     [&](const std::pair<LoadInst *, unsigned> &Load) {
                       unsigned LoadSize =
                           DL.getTypeStoreSize(Load.first->getType());
                       unsigned LoadOffset = Load.second;
                       if (!canPreloadKernArgAtOffset(LoadOffset + LoadSize +
                                                      ImplicitArgsBaseOffset))
                         return true;

                       return false;
                     });

    if (PreloadEnd == ImplicitArgLoads.begin())
      return;

    unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
    Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
    assert(NF);
    FunctionsToErase.push_back(&F);
    for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
      LoadInst *Load = I->first;
      unsigned LoadOffset = I->second;
      unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
      unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
      Argument *Arg = NF->getArg(Index);
      Load->replaceAllUsesWith(Arg);
    }
  }
};
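
// A minimal sketch (not part of the patch) of the IR pattern that
// tryAllocHiddenArgPreloadSGPRs scans for: a load of a hidden argument
// through llvm.amdgcn.implicitarg.ptr. Built with IRBuilder for
// _hidden_group_size_x (offset 12, i16 per the HiddenArgs table); the helper
// name is hypothetical.
static LoadInst *buildHiddenGroupSizeXLoad(IRBuilder<> &B, Module &M) {
  // %p = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  Function *ImplicitArgPtr = Intrinsic::getOrInsertDeclaration(
      &M, Intrinsic::amdgcn_implicitarg_ptr);
  Value *IAPtr = B.CreateCall(ImplicitArgPtr);
  // %gep = getelementptr i8, ptr addrspace(4) %p, i64 12
  Value *GEP = B.CreateConstInBoundsGEP1_64(B.getInt8Ty(), IAPtr, 12);
  // %gsx = load i16, ptr addrspace(4) %gep ; matches HIDDEN_GROUP_SIZE_X
  return B.CreateLoad(B.getInt16Ty(), GEP);
}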

class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
@@ -1314,19 +1523,66 @@ struct AAAMDGPUNoAGPR

const char AAAMDGPUNoAGPR::ID = 0;

-static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
-  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
-  for (unsigned I = 0;
-       I < F.arg_size() &&
-       I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs());
-       ++I) {
-    Argument &Arg = *F.getArg(I);
-    // Check for incompatible attributes.
-    if (Arg.hasByRefAttr() || Arg.hasNestAttr())
-      break;
static void markKernelArgsAsInreg(SetVector<Function *> &Functions,
                                  TargetMachine &TM) {
  SmallVector<Function *, 4> FunctionsToErase;
  for (auto *F : Functions) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
    if (!ST.hasKernargPreload() ||
        F->getCallingConv() != CallingConv::AMDGPU_KERNEL)
      continue;

    PreloadKernelArgInfo PreloadInfo(*F, ST);
    uint64_t ExplicitArgOffset = 0;
    const DataLayout &DL = F->getDataLayout();
    const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();
    unsigned NumPreloadsRequested = KernargPreloadCount;
    unsigned NumPreloadedExplicitArgs = 0;
    for (Argument &Arg : F->args()) {
      // Avoid incompatible attributes and guard against running this pass
      // twice.
      //
      // TODO: Preload byref kernel arguments
      if (Arg.hasByRefAttr() || Arg.hasNestAttr() ||
          Arg.hasAttribute("amdgpu-hidden-argument"))
        break;
Review comment (Contributor): Should the TODO handle byref? In the future all kernel arguments should be byref.
      // Inreg may be pre-existing on some arguments; try to preload these.
      if (NumPreloadsRequested == 0 && !Arg.hasInRegAttr())
        break;

      // FIXME: Preload aggregates.
      if (Arg.getType()->isAggregateType())
        break;

      Type *ArgTy = Arg.getType();
      Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
      uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
      ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
Review comment on lines +1559 to +1561 (Contributor): You should probably use the TargetLowering functions that the calling-convention lowering uses, to see how these types will really be processed.

Reply (Member, Author): I think the way I'm doing it is what we want: AMDGPUCallLowering::lowerFormalArgumentsKernel and the HSA metadata streamer calculate the argument offsets this same way, so it should be consistent with how they are processed.
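
For reference, a worked sketch of the offset rule under discussion (it mirrors the ExplicitArgOffset update above, assuming F and DL as in the surrounding loop; the argument types and numbers are illustrative only):

// Offsets advance by aligning up to the ABI type alignment of each argument,
// then adding its allocation size. For a kernel with (i32, i64, i16) args:
//   i32: alignTo(0, 4)  = 0,  end = 0 + 4  = 4
//   i64: alignTo(4, 8)  = 8,  end = 8 + 8  = 16
//   i16: alignTo(16, 2) = 16, end = 16 + 2 = 18
uint64_t EndOffset = 0;
for (Argument &A : F->args()) {
  Type *Ty = A.getType();
  EndOffset = alignTo(EndOffset, DL.getABITypeAlign(Ty)) +
              DL.getTypeAllocSize(Ty);
}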

      if (!PreloadInfo.canPreloadKernArgAtOffset(ExplicitArgOffset))
        break;

      Arg.addAttr(Attribute::InReg);
      NumPreloadedExplicitArgs++;
      if (NumPreloadsRequested > 0)
        NumPreloadsRequested--;
    }

-    Arg.addAttr(Attribute::InReg);
-  }
-}
    // Only try preloading hidden arguments if we can successfully preload the
    // last explicit argument.
    if (NumPreloadedExplicitArgs == F->arg_size()) {
      uint64_t ImplicitArgsBaseOffset =
          alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
          BaseOffset;
      PreloadInfo.tryAllocHiddenArgPreloadSGPRs(ImplicitArgsBaseOffset,
                                                FunctionsToErase);
    }
  }

  // Erase cloned functions if we needed to update the kernel signature to
  // support preloading hidden kernel arguments.
  for (auto *F : FunctionsToErase)
    F->eraseFromParent();
}

static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
@@ -1378,8 +1634,6 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
    if (!AMDGPU::isEntryFunctionCC(CC)) {
      A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
      A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(*F));
-    } else if (CC == CallingConv::AMDGPU_KERNEL) {
-      addPreloadKernArgHint(*F, TM);
    }

    for (auto &I : instructions(F)) {
@@ -1400,6 +1654,11 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
  }

  ChangeStatus Change = A.run();

  // Mark kernel arguments with 'inreg' attribute to indicate that they should
  // be preloaded into SGPRs.
  markKernelArgsAsInreg(Functions, TM);
Review comment (Contributor): This doesn't reflect whether the pass made a change or not.

  return Change == ChangeStatus::CHANGED;
}
