@@ -27,231 +27,6 @@ using namespace llvm;
27
27
28
28
namespace {
29
29
30
- class PreloadKernelArgInfo {
31
- private:
32
- Function &F;
33
- const GCNSubtarget &ST;
34
- unsigned NumFreeUserSGPRs;
35
-
36
- enum HiddenArg : unsigned {
37
- HIDDEN_BLOCK_COUNT_X,
38
- HIDDEN_BLOCK_COUNT_Y,
39
- HIDDEN_BLOCK_COUNT_Z,
40
- HIDDEN_GROUP_SIZE_X,
41
- HIDDEN_GROUP_SIZE_Y,
42
- HIDDEN_GROUP_SIZE_Z,
43
- HIDDEN_REMAINDER_X,
44
- HIDDEN_REMAINDER_Y,
45
- HIDDEN_REMAINDER_Z,
46
- END_HIDDEN_ARGS
47
- };
48
-
49
- // Stores information about a specific hidden argument.
50
- struct HiddenArgInfo {
51
- // Offset in bytes from the location in the kernearg segment pointed to by
52
- // the implicitarg pointer.
53
- uint8_t Offset;
54
- // The size of the hidden argument in bytes.
55
- uint8_t Size ;
56
- // The name of the hidden argument in the kernel signature.
57
- const char *Name;
58
- };
59
-
60
- static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
61
- {0 , 4 , " _hidden_block_count_x" }, {4 , 4 , " _hidden_block_count_y" },
62
- {8 , 4 , " _hidden_block_count_z" }, {12 , 2 , " _hidden_group_size_x" },
63
- {14 , 2 , " _hidden_group_size_y" }, {16 , 2 , " _hidden_group_size_z" },
64
- {18 , 2 , " _hidden_remainder_x" }, {20 , 2 , " _hidden_remainder_y" },
65
- {22 , 2 , " _hidden_remainder_z" }};
66
-
67
- static HiddenArg getHiddenArgFromOffset (unsigned Offset) {
68
- for (unsigned I = 0 ; I < END_HIDDEN_ARGS; ++I)
69
- if (HiddenArgs[I].Offset == Offset)
70
- return static_cast <HiddenArg>(I);
71
-
72
- return END_HIDDEN_ARGS;
73
- }
74
-
75
- static Type *getHiddenArgType (LLVMContext &Ctx, HiddenArg HA) {
76
- if (HA < END_HIDDEN_ARGS)
77
- return Type::getIntNTy (Ctx, HiddenArgs[HA].Size * 8 );
78
-
79
- llvm_unreachable (" Unexpected hidden argument." );
80
- }
81
-
82
- static const char *getHiddenArgName (HiddenArg HA) {
83
- if (HA < END_HIDDEN_ARGS) {
84
- return HiddenArgs[HA].Name ;
85
- }
86
- llvm_unreachable (" Unexpected hidden argument." );
87
- }
88
-
89
- // Clones the function after adding implicit arguments to the argument list
90
- // and returns the new updated function. Preloaded implicit arguments are
91
- // added up to and including the last one that will be preloaded, indicated by
92
- // LastPreloadIndex. Currently preloading is only performed on the totality of
93
- // sequential data from the kernarg segment including implicit (hidden)
94
- // arguments. This means that all arguments up to the last preloaded argument
95
- // will also be preloaded even if that data is unused.
96
- Function *cloneFunctionWithPreloadImplicitArgs (unsigned LastPreloadIndex) {
97
- FunctionType *FT = F.getFunctionType ();
98
- LLVMContext &Ctx = F.getParent ()->getContext ();
99
- SmallVector<Type *, 16 > FTypes (FT->param_begin (), FT->param_end ());
100
- for (unsigned I = 0 ; I <= LastPreloadIndex; ++I)
101
- FTypes.push_back (getHiddenArgType (Ctx, HiddenArg (I)));
102
-
103
- FunctionType *NFT =
104
- FunctionType::get (FT->getReturnType (), FTypes, FT->isVarArg ());
105
- Function *NF =
106
- Function::Create (NFT, F.getLinkage (), F.getAddressSpace (), F.getName ());
107
-
108
- NF->copyAttributesFrom (&F);
109
- NF->copyMetadata (&F, 0 );
110
- NF->setIsNewDbgInfoFormat (F.IsNewDbgInfoFormat );
111
-
112
- F.getParent ()->getFunctionList ().insert (F.getIterator (), NF);
113
- NF->takeName (&F);
114
- NF->splice (NF->begin (), &F);
115
-
116
- Function::arg_iterator NFArg = NF->arg_begin ();
117
- for (Argument &Arg : F.args ()) {
118
- Arg.replaceAllUsesWith (&*NFArg);
119
- NFArg->takeName (&Arg);
120
- ++NFArg;
121
- }
122
-
123
- AttrBuilder AB (Ctx);
124
- AB.addAttribute (Attribute::InReg);
125
- AB.addAttribute (" amdgpu-hidden-argument" );
126
- AttributeList AL = NF->getAttributes ();
127
- for (unsigned I = 0 ; I <= LastPreloadIndex; ++I) {
128
- AL = AL.addParamAttributes (Ctx, NFArg->getArgNo (), AB);
129
- NFArg++->setName (getHiddenArgName (HiddenArg (I)));
130
- }
131
-
132
- NF->setAttributes (AL);
133
- F.replaceAllUsesWith (NF);
134
- F.setCallingConv (CallingConv::C);
135
- F.clearMetadata ();
136
-
137
- return NF;
138
- }
139
-
140
- public:
141
- PreloadKernelArgInfo (Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
142
- setInitialFreeUserSGPRsCount ();
143
- }
144
-
145
- // Returns the maximum number of user SGPRs that we have available to preload
146
- // arguments.
147
- void setInitialFreeUserSGPRsCount () {
148
- GCNUserSGPRUsageInfo UserSGPRInfo (F, ST);
149
- NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs ();
150
- }
151
-
152
- bool tryAllocPreloadSGPRs (unsigned AllocSize, uint64_t ArgOffset,
153
- uint64_t LastExplicitArgOffset) {
154
- // Check if this argument may be loaded into the same register as the
155
- // previous argument.
156
- if (ArgOffset - LastExplicitArgOffset < 4 &&
157
- !isAligned (Align (4 ), ArgOffset))
158
- return true ;
159
-
160
- // Pad SGPRs for kernarg alignment.
161
- ArgOffset = alignDown (ArgOffset, 4 );
162
- unsigned Padding = ArgOffset - LastExplicitArgOffset;
163
- unsigned PaddingSGPRs = alignTo (Padding, 4 ) / 4 ;
164
- unsigned NumPreloadSGPRs = alignTo (AllocSize, 4 ) / 4 ;
165
- if (NumPreloadSGPRs + PaddingSGPRs > NumFreeUserSGPRs)
166
- return false ;
167
-
168
- NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
169
- return true ;
170
- }
171
-
172
- // Try to allocate SGPRs to preload implicit kernel arguments.
173
- void tryAllocImplicitArgPreloadSGPRs (uint64_t ImplicitArgsBaseOffset,
174
- uint64_t LastExplicitArgOffset,
175
- IRBuilder<> &Builder) {
176
- Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists (
177
- F.getParent (), Intrinsic::amdgcn_implicitarg_ptr);
178
- if (!ImplicitArgPtr)
179
- return ;
180
-
181
- const DataLayout &DL = F.getParent ()->getDataLayout ();
182
- // Pair is the load and the load offset.
183
- SmallVector<std::pair<LoadInst *, unsigned >, 4 > ImplicitArgLoads;
184
- for (auto *U : ImplicitArgPtr->users ()) {
185
- Instruction *CI = dyn_cast<Instruction>(U);
186
- if (!CI || CI->getParent ()->getParent () != &F)
187
- continue ;
188
-
189
- for (auto *U : CI->users ()) {
190
- int64_t Offset = 0 ;
191
- auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
192
- if (!Load) {
193
- if (GetPointerBaseWithConstantOffset (U, Offset, DL) != CI)
194
- continue ;
195
-
196
- Load = dyn_cast<LoadInst>(*U->user_begin ()); // Load from GEP?
197
- }
198
-
199
- if (!Load || !Load->isSimple ())
200
- continue ;
201
-
202
- // FIXME: Expand to handle 64-bit implicit args and large merged loads.
203
- LLVMContext &Ctx = F.getParent ()->getContext ();
204
- Type *LoadTy = Load->getType ();
205
- HiddenArg HA = getHiddenArgFromOffset (Offset);
206
- if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType (Ctx, HA))
207
- continue ;
208
-
209
- ImplicitArgLoads.push_back (std::make_pair (Load, Offset));
210
- }
211
- }
212
-
213
- if (ImplicitArgLoads.empty ())
214
- return ;
215
-
216
- // Allocate loads in order of offset. We need to be sure that the implicit
217
- // argument can actually be preloaded.
218
- std::sort (ImplicitArgLoads.begin (), ImplicitArgLoads.end (), less_second ());
219
-
220
- // If we fail to preload any implicit argument we know we don't have SGPRs
221
- // to preload any subsequent ones with larger offsets. Find the first
222
- // argument that we cannot preload.
223
- auto *PreloadEnd = std::find_if (
224
- ImplicitArgLoads.begin (), ImplicitArgLoads.end (),
225
- [&](const std::pair<LoadInst *, unsigned > &Load) {
226
- unsigned LoadSize = DL.getTypeStoreSize (Load.first ->getType ());
227
- unsigned LoadOffset = Load.second ;
228
- if (!tryAllocPreloadSGPRs (LoadSize,
229
- LoadOffset + ImplicitArgsBaseOffset,
230
- LastExplicitArgOffset))
231
- return true ;
232
-
233
- LastExplicitArgOffset =
234
- ImplicitArgsBaseOffset + LoadOffset + LoadSize;
235
- return false ;
236
- });
237
-
238
- if (PreloadEnd == ImplicitArgLoads.begin ())
239
- return ;
240
-
241
- unsigned LastHiddenArgIndex = getHiddenArgFromOffset (PreloadEnd[-1 ].second );
242
- Function *NF = cloneFunctionWithPreloadImplicitArgs (LastHiddenArgIndex);
243
- assert (NF);
244
- for (const auto *I = ImplicitArgLoads.begin (); I != PreloadEnd; ++I) {
245
- LoadInst *LoadInst = I->first ;
246
- unsigned LoadOffset = I->second ;
247
- unsigned HiddenArgIndex = getHiddenArgFromOffset (LoadOffset);
248
- unsigned Index = NF->arg_size () - LastHiddenArgIndex + HiddenArgIndex - 1 ;
249
- Argument *Arg = NF->getArg (Index);
250
- LoadInst->replaceAllUsesWith (Arg);
251
- }
252
- }
253
- };
254
-
255
30
class AMDGPULowerKernelArguments : public FunctionPass {
256
31
public:
257
32
static char ID;
@@ -311,10 +86,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
311
86
Attribute::getWithDereferenceableBytes (Ctx, TotalKernArgSize));
312
87
313
88
uint64_t ExplicitArgOffset = 0 ;
314
- // Preloaded kernel arguments must be sequential.
315
- bool InPreloadSequence = true ;
316
- PreloadKernelArgInfo PreloadInfo (F, ST);
317
-
318
89
for (Argument &Arg : F.args ()) {
319
90
const bool IsByRef = Arg.hasByRefAttr ();
320
91
Type *ArgTy = IsByRef ? Arg.getParamByRefType () : Arg.getType ();
@@ -325,25 +96,10 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
325
96
uint64_t AllocSize = DL.getTypeAllocSize (ArgTy);
326
97
327
98
uint64_t EltOffset = alignTo (ExplicitArgOffset, ABITypeAlign) + BaseOffset;
328
- uint64_t LastExplicitArgOffset = ExplicitArgOffset;
329
99
ExplicitArgOffset = alignTo (ExplicitArgOffset, ABITypeAlign) + AllocSize;
330
100
331
- // Guard against the situation where hidden arguments have already been
332
- // lowered and added to the kernel function signature, i.e. in a situation
333
- // where this pass has run twice.
334
- if (Arg.hasAttribute (" amdgpu-hidden-argument" ))
335
- break ;
336
-
337
- // Try to preload this argument into user SGPRs.
338
- if (Arg.hasInRegAttr () && InPreloadSequence && ST.hasKernargPreload () &&
339
- !Arg.getType ()->isAggregateType ())
340
- if (PreloadInfo.tryAllocPreloadSGPRs (AllocSize, EltOffset,
341
- LastExplicitArgOffset))
342
- continue ;
343
-
344
- InPreloadSequence = false ;
345
-
346
- if (Arg.use_empty ())
101
+ // Skip inreg arguments which should be preloaded.
102
+ if (Arg.use_empty () || Arg.hasInRegAttr ())
347
103
continue ;
348
104
349
105
// If this is byval, the loads are already explicit in the function. We just
@@ -483,14 +239,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
483
239
KernArgSegment->addRetAttr (
484
240
Attribute::getWithAlignment (Ctx, std::max (KernArgBaseAlign, MaxAlign)));
485
241
486
- if (InPreloadSequence) {
487
- uint64_t ImplicitArgsBaseOffset =
488
- alignTo (ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr ()) +
489
- BaseOffset;
490
- PreloadInfo.tryAllocImplicitArgPreloadSGPRs (ImplicitArgsBaseOffset,
491
- ExplicitArgOffset, Builder);
492
- }
493
-
494
242
return true ;
495
243
}
496
244
0 commit comments