[AMDGPU][NFCI] Remove preload kernarg alloc dep on DAG isel path #96030
Conversation
Makes the function allocatePreloadKernArgSGPRs callable from DAG isel and GIsel paths in preparation for supporting preloading kernargs with global isel. This is also a prerequisite for some refactoring of preload kernargs to allow preloading non-consecutive arguments.
@llvm/pr-subscribers-backend-amdgpu

Author: Austin Kerbow (kerbowa)

Changes

Makes the function allocatePreloadKernArgSGPRs callable from DAG isel and GIsel paths in preparation for supporting preloading kernargs with global isel. This is also a prerequisite for some refactoring of preload kernargs to allow preloading non-consecutive arguments.

Full diff: https://github.com/llvm/llvm-project/pull/96030.diff

2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c436e03806dc8..9f05e9245a3b2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2465,71 +2465,94 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
// these from the dispatch pointer.
}
+static bool allocPreloadKernArg(uint64_t &LastExplicitArgOffset,
+ uint64_t ExplicitArgOffset, uint64_t ArgOffset,
+ unsigned ArgSize, unsigned Idx,
+ MachineFunction &MF, const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info, CCState &CCInfo) {
+ if (ArgOffset >= ExplicitArgOffset)
+ return false;
+
+ const Align KernelArgBaseAlign = Align(16);
+ Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
+ unsigned NumAllocSGPRs = alignTo(ArgSize, 4) / 4;
+
+ // Arg is preloaded into the previous SGPR.
+ if (ArgSize < 4 && Alignment < 4) {
+ Info.getArgInfo().PreloadKernArgs[Idx].Regs.push_back(
+ Info.getArgInfo().PreloadKernArgs[Idx - 1].Regs[0]);
+ return true;
+ }
+
+ unsigned Padding = ArgOffset - LastExplicitArgOffset;
+ unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
+ // Check for free user SGPRs for preloading.
+ if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
+ Info.getUserSGPRInfo().getNumFreeUserSGPRs()) {
+ return false;
+ }
+
+ // Preload this argument.
+ const TargetRegisterClass *RC =
+ TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
+ SmallVectorImpl<MCRegister> *PreloadRegs =
+ Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, Idx, PaddingSGPRs);
+
+ if (PreloadRegs->size() > 1)
+ RC = &AMDGPU::SGPR_32RegClass;
+
+ for (auto &Reg : *PreloadRegs) {
+ assert(Reg);
+ MF.addLiveIn(Reg, RC);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
+ return true;
+}
+
// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
// sequential starting from the first argument.
void SITargetLowering::allocatePreloadKernArgSGPRs(
- CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
- const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
+ CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs, MachineFunction &MF,
const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
Function &F = MF.getFunction();
- unsigned LastExplicitArgOffset =
- MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
- GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
- bool InPreloadSequence = true;
- unsigned InIdx = 0;
+ const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset();
+ uint64_t ExplicitArgOffset = BaseOffset;
+ uint64_t LastExplicitArgOffset = ExplicitArgOffset;
+ unsigned LocIdx = 0;
for (auto &Arg : F.args()) {
- if (!InPreloadSequence || !Arg.hasInRegAttr())
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const bool IsByRef = Arg.hasByRefAttr();
+ Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
+ unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
+ if (AllocSize == 0)
break;
- int ArgIdx = Arg.getArgNo();
- // Don't preload non-original args or parts not in the current preload
- // sequence.
- if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
- (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
+ MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
+ Align ABIAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);
+ uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
+ ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
+ if (!Arg.hasInRegAttr())
break;
- for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
- (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
- InIdx++) {
- assert(ArgLocs[ArgIdx].isMemLoc());
- auto &ArgLoc = ArgLocs[InIdx];
- const Align KernelArgBaseAlign = Align(16);
- unsigned ArgOffset = ArgLoc.getLocMemOffset();
- Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
- unsigned NumAllocSGPRs =
- alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
-
- // Arg is preloaded into the previous SGPR.
- if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
- Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
- Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
- continue;
- }
-
- unsigned Padding = ArgOffset - LastExplicitArgOffset;
- unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
- // Check for free user SGPRs for preloading.
- if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
- SGPRInfo.getNumFreeUserSGPRs()) {
- InPreloadSequence = false;
- break;
- }
-
- // Preload this argument.
- const TargetRegisterClass *RC =
- TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
- SmallVectorImpl<MCRegister> *PreloadRegs =
- Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
-
- if (PreloadRegs->size() > 1)
- RC = &AMDGPU::SGPR_32RegClass;
- for (auto &Reg : *PreloadRegs) {
- assert(Reg);
- MF.addLiveIn(Reg, RC);
- CCInfo.AllocateReg(Reg);
+ if (!ArgLocs.size()) {
+ // global isel
+ allocPreloadKernArg(LastExplicitArgOffset, ExplicitArgOffset,
+ ArgOffset, AllocSize, Arg.getArgNo(), MF,
+ TRI, Info, CCInfo);
+ } else {
+ // DAG isel
+ for (; LocIdx < ArgLocs.size(); LocIdx++) {
+ CCValAssign &ArgLoc = ArgLocs[LocIdx];
+ assert(ArgLoc.isMemLoc());
+ uint64_t LocOffset = ArgLoc.getLocMemOffset();
+ unsigned LocSize = ArgLoc.getLocVT().getStoreSize();
+ if (!allocPreloadKernArg(LastExplicitArgOffset, ExplicitArgOffset,
+ LocOffset, LocSize, LocIdx, MF, TRI, Info,
+ CCInfo))
+ break;
}
-
- LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
}
}
}
@@ -2854,7 +2877,7 @@ SDValue SITargetLowering::LowerFormalArguments(
allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
if (IsKernel && Subtarget->hasKernargPreload())
- allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
+ allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, MF, *TRI, *Info);
allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
} else if (!IsGraphics) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 1f198a92c0fa6..077f66e726a60 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -563,7 +563,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
void allocatePreloadKernArgSGPRs(CCState &CCInfo,
SmallVectorImpl<CCValAssign> &ArgLocs,
- const SmallVectorImpl<ISD::InputArg> &Ins,
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const;
Align ABIAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);
uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
if (!Arg.hasInRegAttr())
Nit: You can move this to the top of the loop, there's no need to compute all these offsets and whatnot if we're going to bail out anyway.
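A rough sketch of that reordering (my reading, not the author's code): the walk stops at the first argument without inreg, and nothing past the break uses the offsets computed for that argument, so the check can come first.

```cpp
for (auto &Arg : F.args()) {
  // Bail out before any offset/alignment bookkeeping; the preload walk ends
  // at the first argument that is not marked inreg.
  if (!Arg.hasInRegAttr())
    break;

  const bool IsByRef = Arg.hasByRefAttr();
  Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
  // ... size, alignment, and ArgOffset computation as in the patch ...
}
```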
const bool IsByRef = Arg.hasByRefAttr();
Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
if (AllocSize == 0)
Why break and not continue?
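If the suggestion is taken, the change would presumably be just this (a sketch; whether later zero-sized arguments can still be handled correctly is exactly the open question):

```cpp
if (AllocSize == 0)
  continue; // skip this argument rather than ending the preload walk
```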
TRI, Info, CCInfo);
} else {
// DAG isel
for (; LocIdx < ArgLocs.size(); LocIdx++) {
Nit: Are you planning to use LocIdx for anything else? If not, you can probably move the declaration here.
(int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
Align ABIAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);
uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
Nit: You can probably move ArgOffset closer to its use.
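One possible shape for that, sketched with a hypothetical AlignedOffset local so the shared alignTo result isn't computed twice; ArgOffset then only exists once the argument is known to be a preload candidate:

```cpp
uint64_t AlignedOffset = alignTo(ExplicitArgOffset, ABIAlign);
ExplicitArgOffset = AlignedOffset + AllocSize;
if (!Arg.hasInRegAttr())
  break;
// Only preload candidates need the kernarg-segment offset.
uint64_t ArgOffset = AlignedOffset + BaseOffset;
```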
if (PreloadRegs->size() > 1)
RC = &AMDGPU::SGPR_32RegClass;

for (auto &Reg : *PreloadRegs) {
No auto, no reference
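i.e. take the elements by value, along these lines (MCRegister is a cheap value type, so copying is fine):

```cpp
for (MCRegister Reg : *PreloadRegs) {
  assert(Reg);
  MF.addLiveIn(Reg, RC);
  CCInfo.AllocateReg(Reg);
}
```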
for (auto &Arg : F.args()) {
if (!InPreloadSequence || !Arg.hasInRegAttr())
const DataLayout &DL = F.getParent()->getDataLayout();
Pull out of loop
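Sketch of the hoisted form; the DataLayout doesn't change per argument, so it only needs to be fetched once:

```cpp
const DataLayout &DL = F.getParent()->getDataLayout();
for (auto &Arg : F.args()) {
  const bool IsByRef = Arg.hasByRefAttr();
  Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
  // ...
}
```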
const DataLayout &DL = F.getParent()->getDataLayout();
const bool IsByRef = Arg.hasByRefAttr();
Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
Use TypeSize?
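Presumably something along these lines (a sketch only, assuming scalable sizes cannot occur for kernel arguments, so the fixed value is always valid):

```cpp
TypeSize AllocSize = DL.getTypeAllocSize(ArgTy);
if (AllocSize.isZero())
  break;
// Downstream uses would switch to AllocSize.getFixedValue() instead of
// truncating to unsigned.
```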
MF.addLiveIn(Reg, RC);
CCInfo.AllocateReg(Reg);
if (!ArgLocs.size()) {
// global isel
The DAG and GlobalISel paths should be identical here. The GlobalISel calling convention stuff is still just a shim on top of the same handling.