 #include "GCNSubtarget.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/Analysis/CycleAnalysis.h"
+ #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+ #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/Target/TargetMachine.h"
@@ -144,6 +146,213 @@ static bool funcRequiresHostcallPtr(const Function &F) {
}

namespace {
+
+ class PreloadKernelArgInfo {
+ private:
+   Function &F;
+   const GCNSubtarget &ST;
+   unsigned NumFreeUserSGPRs;
+
+   enum HiddenArg : unsigned {
+     HIDDEN_BLOCK_COUNT_X,
+     HIDDEN_BLOCK_COUNT_Y,
+     HIDDEN_BLOCK_COUNT_Z,
+     HIDDEN_GROUP_SIZE_X,
+     HIDDEN_GROUP_SIZE_Y,
+     HIDDEN_GROUP_SIZE_Z,
+     HIDDEN_REMAINDER_X,
+     HIDDEN_REMAINDER_Y,
+     HIDDEN_REMAINDER_Z,
+     END_HIDDEN_ARGS
+   };
+
+   // Stores information about a specific hidden argument.
+   struct HiddenArgInfo {
+     // Offset in bytes from the location in the kernarg segment pointed to by
+     // the implicitarg pointer.
+     uint8_t Offset;
+     // The size of the hidden argument in bytes.
+     uint8_t Size;
+     // The name of the hidden argument in the kernel signature.
+     const char *Name;
+   };
+
+   static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
+       {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
+       {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
+       {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
+       {18, 2, "_hidden_remainder_x"},  {20, 2, "_hidden_remainder_y"},
+       {22, 2, "_hidden_remainder_z"}};
+
+   static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
+     for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
+       if (HiddenArgs[I].Offset == Offset)
+         return static_cast<HiddenArg>(I);
+
+     return END_HIDDEN_ARGS;
+   }
+
+   static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
+     if (HA < END_HIDDEN_ARGS)
+       return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);
+
+     llvm_unreachable("Unexpected hidden argument.");
+   }
+
+   static const char *getHiddenArgName(HiddenArg HA) {
+     if (HA < END_HIDDEN_ARGS) {
+       return HiddenArgs[HA].Name;
+     }
+     llvm_unreachable("Unexpected hidden argument.");
+   }
+
+   // Clones the function after adding implicit arguments to the argument list
+   // and returns the new updated function. Preloaded implicit arguments are
+   // added up to and including the last one that will be preloaded, indicated
+   // by LastPreloadIndex. Currently preloading is only performed on the
+   // totality of sequential data from the kernarg segment including implicit
+   // (hidden) arguments. This means that all arguments up to the last
+   // preloaded argument will also be preloaded even if that data is unused.
+   Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
+     FunctionType *FT = F.getFunctionType();
+     LLVMContext &Ctx = F.getParent()->getContext();
+     SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
+     for (unsigned I = 0; I <= LastPreloadIndex; ++I)
+       FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));
+
+     FunctionType *NFT =
+         FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
+     Function *NF =
+         Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());
+
+     NF->copyAttributesFrom(&F);
+     NF->copyMetadata(&F, 0);
+     NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);
+
+     F.getParent()->getFunctionList().insert(F.getIterator(), NF);
+     NF->takeName(&F);
+     NF->splice(NF->begin(), &F);
+
+     Function::arg_iterator NFArg = NF->arg_begin();
+     for (Argument &Arg : F.args()) {
+       Arg.replaceAllUsesWith(&*NFArg);
+       NFArg->takeName(&Arg);
+       ++NFArg;
+     }
+
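+     // Mark each appended hidden argument 'inreg' so it is preloaded into a
+     // user SGPR, and tag it so later runs of the pass can recognize it.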
+     AttrBuilder AB(Ctx);
+     AB.addAttribute(Attribute::InReg);
+     AB.addAttribute("amdgpu-hidden-argument");
+     AttributeList AL = NF->getAttributes();
+     for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
+       AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
+       NFArg++->setName(getHiddenArgName(HiddenArg(I)));
+     }
+
+     NF->setAttributes(AL);
+     F.replaceAllUsesWith(NF);
+
+     return NF;
+   }
+
+ public:
+   PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
+     setInitialFreeUserSGPRsCount();
+   }
+
+   // Set the number of free user SGPRs that are available to preload
+   // arguments.
+   void setInitialFreeUserSGPRsCount() {
+     GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
+     NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
+   }
+
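+   // Each free user SGPR can hold 4 bytes of contiguous kernarg segment data.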
+   bool canPreloadKernArgAtOffset(uint64_t ExplicitArgOffset) {
+     return ExplicitArgOffset <= NumFreeUserSGPRs * 4;
+   }
+
+   // Try to allocate SGPRs to preload hidden kernel arguments.
+   void
+   tryAllocHiddenArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
+                                 SmallVectorImpl<Function *> &FunctionsToErase) {
+     Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
+         F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
+     if (!ImplicitArgPtr)
+       return;
+
+     const DataLayout &DL = F.getParent()->getDataLayout();
+     // Pair is the load and the load offset.
+     SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
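+     // Collect simple loads of hidden arguments, reached either directly from
+     // the implicitarg pointer or through a constant-offset GEP on it.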
+     for (auto *U : ImplicitArgPtr->users()) {
+       Instruction *CI = dyn_cast<Instruction>(U);
+       if (!CI || CI->getParent()->getParent() != &F)
+         continue;
+
+       for (auto *U : CI->users()) {
+         int64_t Offset = 0;
+         auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
+         if (!Load) {
+           if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
+             continue;
+
+           Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
+         }
+
+         if (!Load || !Load->isSimple())
+           continue;
+
+         // FIXME: Expand to handle merged loads.
+         LLVMContext &Ctx = F.getParent()->getContext();
+         Type *LoadTy = Load->getType();
+         HiddenArg HA = getHiddenArgFromOffset(Offset);
+         if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
+           continue;
+
+         ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
+       }
+     }
+
+     if (ImplicitArgLoads.empty())
+       return;
+
+     // Allocate loads in order of offset. We need to be sure that the implicit
+     // argument can actually be preloaded.
+     std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());
+
+     // If we fail to preload any implicit argument we know we don't have SGPRs
+     // to preload any subsequent ones with larger offsets. Find the first
+     // argument that we cannot preload.
+     auto *PreloadEnd =
+         std::find_if(ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
+                      [&](const std::pair<LoadInst *, unsigned> &Load) {
+                        unsigned LoadSize =
+                            DL.getTypeStoreSize(Load.first->getType());
+                        unsigned LoadOffset = Load.second;
+                        if (!canPreloadKernArgAtOffset(LoadOffset + LoadSize +
+                                                       ImplicitArgsBaseOffset))
+                          return true;
+
+                        return false;
+                      });
+
+     if (PreloadEnd == ImplicitArgLoads.begin())
+       return;
+
+     unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
+     Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
+     assert(NF);
+     FunctionsToErase.push_back(&F);
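+     // The hidden arguments were appended at the end of NF's argument list;
+     // map each preloaded load back to its new argument and forward its uses.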
+     for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
+       LoadInst *LoadInst = I->first;
+       unsigned LoadOffset = I->second;
+       unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
+       unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
+       Argument *Arg = NF->getArg(Index);
+       LoadInst->replaceAllUsesWith(Arg);
+     }
+   }
+ };
+
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
@@ -1314,19 +1523,64 @@ struct AAAMDGPUNoAGPR

const char AAAMDGPUNoAGPR::ID = 0;

- static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
-   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
-   for (unsigned I = 0;
-        I < F.arg_size() &&
-        I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs());
-        ++I) {
-     Argument &Arg = *F.getArg(I);
-     // Check for incompatible attributes.
-     if (Arg.hasByRefAttr() || Arg.hasNestAttr())
-       break;
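+ // Mark kernel arguments 'inreg' so they are preloaded into user SGPRs. When
+ // every explicit argument can be preloaded, also try to preload hidden
+ // (implicit) arguments.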
+ static void markKernelArgsAsInreg(SetVector<Function *> &Functions,
+                                   TargetMachine &TM) {
+   SmallVector<Function *, 4> FunctionsToErase;
+   for (auto *F : Functions) {
+     const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
+     if (!ST.hasKernargPreload() ||
+         F->getCallingConv() != CallingConv::AMDGPU_KERNEL || F->arg_empty())
+       continue;
+
+     PreloadKernelArgInfo PreloadInfo(*F, ST);
+     uint64_t ExplicitArgOffset = 0;
+     const DataLayout &DL = F->getDataLayout();
+     const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();
+     unsigned NumPreloadsRequested = KernargPreloadCount;
+     unsigned NumPreloadedExplicitArgs = 0;
+     for (Argument &Arg : F->args()) {
+       // Avoid incompatible attributes and guard against running this pass
+       // twice.
+       if (Arg.hasByRefAttr() || Arg.hasNestAttr() ||
+           Arg.hasAttribute("amdgpu-hidden-argument"))
+         break;
+
+       // Inreg may be pre-existing on some arguments; try to preload these.
+       if (NumPreloadsRequested == 0 && !Arg.hasInRegAttr())
+         break;
+
+       // FIXME: Preload aggregates.
+       if (Arg.getType()->isAggregateType())
+         break;
+
+       Type *ArgTy = Arg.getType();
+       Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
+       uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
+       ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
+       if (!PreloadInfo.canPreloadKernArgAtOffset(ExplicitArgOffset))
+         break;
+
+       Arg.addAttr(Attribute::InReg);
+       NumPreloadedExplicitArgs++;
+       if (NumPreloadsRequested > 0)
+         NumPreloadsRequested--;
+     }

-     Arg.addAttr(Attribute::InReg);
+     // Only try preloading hidden arguments if we can successfully preload the
+     // last explicit argument.
+     if (NumPreloadedExplicitArgs == F->arg_size()) {
+       uint64_t ImplicitArgsBaseOffset =
+           alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
+           BaseOffset;
+       PreloadInfo.tryAllocHiddenArgPreloadSGPRs(ImplicitArgsBaseOffset,
+                                                 FunctionsToErase);
+     }
  }
+
+   // Erase the original functions that were cloned to update the kernel
+   // signature for preloading hidden kernel arguments.
+   for (auto *F : FunctionsToErase)
+     F->eraseFromParent();
}

static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
@@ -1378,8 +1632,6 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
    if (!AMDGPU::isEntryFunctionCC(CC)) {
      A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
      A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(*F));
-     } else if (CC == CallingConv::AMDGPU_KERNEL) {
-       addPreloadKernArgHint(*F, TM);
    }

    for (auto &I : instructions(F)) {
@@ -1400,6 +1652,11 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
  }

  ChangeStatus Change = A.run();
+
+   // Mark kernel arguments with 'inreg' attribute to indicate that they should
+   // be preloaded into SGPRs.
+   markKernelArgsAsInreg(Functions, TM);
+
  return Change == ChangeStatus::CHANGED;
}
