13
13
14
14
#include " AMDGPU.h"
15
15
#include " GCNSubtarget.h"
16
+ #include " llvm/ADT/StringExtras.h"
17
+ #include " llvm/Analysis/ValueTracking.h"
16
18
#include " llvm/CodeGen/TargetPassConfig.h"
17
19
#include " llvm/IR/IRBuilder.h"
18
20
#include " llvm/IR/IntrinsicsAMDGPU.h"
@@ -31,9 +33,110 @@ class PreloadKernelArgInfo {
31
33
const GCNSubtarget &ST;
32
34
unsigned NumFreeUserSGPRs;
33
35
34
- public:
35
- SmallVector<llvm::Metadata *, 8 > KernelArgMetadata;
36
+ enum HiddenArg : unsigned {
37
+ HIDDEN_BLOCK_COUNT_X,
38
+ HIDDEN_BLOCK_COUNT_Y,
39
+ HIDDEN_BLOCK_COUNT_Z,
40
+ HIDDEN_GROUP_SIZE_X,
41
+ HIDDEN_GROUP_SIZE_Y,
42
+ HIDDEN_GROUP_SIZE_Z,
43
+ HIDDEN_REMAINDER_X,
44
+ HIDDEN_REMAINDER_Y,
45
+ HIDDEN_REMAINDER_Z,
46
+ END_HIDDEN_ARGS
47
+ };
48
+
49
+ // Stores information about a specific hidden argument.
50
+ struct HiddenArgInfo {
51
+ // Offset in bytes from the location in the kernearg segment pointed to by
52
+ // the implicitarg pointer.
53
+ uint8_t Offset;
54
+ // The size of the hidden argument in bytes.
55
+ uint8_t Size ;
56
+ // The name of the hidden argument in the kernel signature.
57
+ const char *Name;
58
+ };
59
+
60
+ static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
61
+ {0 , 4 , " _hidden_block_count_x" }, {4 , 4 , " _hidden_block_count_y" },
62
+ {8 , 4 , " _hidden_block_count_z" }, {12 , 2 , " _hidden_group_size_x" },
63
+ {14 , 2 , " _hidden_group_size_y" }, {16 , 2 , " _hidden_group_size_z" },
64
+ {18 , 2 , " _hidden_remainder_x" }, {20 , 2 , " _hidden_remainder_y" },
65
+ {22 , 2 , " _hidden_remainder_z" }};
66
+
67
+ static HiddenArg getHiddenArgFromOffset (unsigned Offset) {
68
+ for (unsigned I = 0 ; I < END_HIDDEN_ARGS; ++I)
69
+ if (HiddenArgs[I].Offset == Offset)
70
+ return static_cast <HiddenArg>(I);
71
+
72
+ return END_HIDDEN_ARGS;
73
+ }
74
+
75
+ static Type *getHiddenArgType (LLVMContext &Ctx, HiddenArg HA) {
76
+ if (HA < END_HIDDEN_ARGS)
77
+ return Type::getIntNTy (Ctx, HiddenArgs[HA].Size * 8 );
78
+
79
+ llvm_unreachable (" Unexpected hidden argument." );
80
+ }
81
+
82
+ static const char *getHiddenArgName (HiddenArg HA) {
83
+ if (HA < END_HIDDEN_ARGS) {
84
+ return HiddenArgs[HA].Name ;
85
+ }
86
+ llvm_unreachable (" Unexpected hidden argument." );
87
+ }
36
88
89
+ // Clones the function after adding implicit arguments to the argument list
90
+ // and returns the new updated function. Preloaded implicit arguments are
91
+ // added up to and including the last one that will be preloaded, indicated by
92
+ // LastPreloadIndex. Currently preloading is only performed on the totality of
93
+ // sequential data from the kernarg segment including implicit (hidden)
94
+ // arguments. This means that all arguments up to the last preloaded argument
95
+ // will also be preloaded even if that data is unused.
96
+ Function *cloneFunctionWithPreloadImplicitArgs (unsigned LastPreloadIndex) {
97
+ FunctionType *FT = F.getFunctionType ();
98
+ LLVMContext &Ctx = F.getParent ()->getContext ();
99
+ SmallVector<Type *, 16 > FTypes (FT->param_begin (), FT->param_end ());
100
+ for (unsigned I = 0 ; I <= LastPreloadIndex; ++I)
101
+ FTypes.push_back (getHiddenArgType (Ctx, HiddenArg (I)));
102
+
103
+ FunctionType *NFT =
104
+ FunctionType::get (FT->getReturnType (), FTypes, FT->isVarArg ());
105
+ Function *NF =
106
+ Function::Create (NFT, F.getLinkage (), F.getAddressSpace (), F.getName ());
107
+
108
+ NF->copyAttributesFrom (&F);
109
+ NF->copyMetadata (&F, 0 );
110
+ NF->setIsNewDbgInfoFormat (F.IsNewDbgInfoFormat );
111
+
112
+ F.getParent ()->getFunctionList ().insert (F.getIterator (), NF);
113
+ NF->takeName (&F);
114
+ NF->splice (NF->begin (), &F);
115
+
116
+ Function::arg_iterator NFArg = NF->arg_begin ();
117
+ for (Argument &Arg : F.args ()) {
118
+ Arg.replaceAllUsesWith (&*NFArg);
119
+ NFArg->takeName (&Arg);
120
+ ++NFArg;
121
+ }
122
+
123
+ AttrBuilder AB (Ctx);
124
+ AB.addAttribute (Attribute::InReg);
125
+ AB.addAttribute (" amdgpu-hidden-argument" );
126
+ AttributeList AL = NF->getAttributes ();
127
+ for (unsigned I = 0 ; I <= LastPreloadIndex; ++I) {
128
+ AL = AL.addParamAttributes (Ctx, NFArg->getArgNo (), AB);
129
+ NFArg++->setName (getHiddenArgName (HiddenArg (I)));
130
+ }
131
+
132
+ NF->setAttributes (AL);
133
+ F.replaceAllUsesWith (NF);
134
+ F.setCallingConv (CallingConv::C);
135
+
136
+ return NF;
137
+ }
138
+
139
+ public:
37
140
PreloadKernelArgInfo (Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
38
141
setInitialFreeUserSGPRsCount ();
39
142
}
@@ -64,6 +167,87 @@ class PreloadKernelArgInfo {
64
167
NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
65
168
return true ;
66
169
}
170
+
171
+ // Try to allocate SGPRs to preload implicit kernel arguments.
172
+ void tryAllocImplicitArgPreloadSGPRs (uint64_t ImplicitArgsBaseOffset,
173
+ IRBuilder<> &Builder) {
174
+ StringRef Name = Intrinsic::getName (Intrinsic::amdgcn_implicitarg_ptr);
175
+ Function *ImplicitArgPtr = F.getParent ()->getFunction (Name);
176
+ if (!ImplicitArgPtr)
177
+ return ;
178
+
179
+ const DataLayout &DL = F.getParent ()->getDataLayout ();
180
+ // Pair is the load and the load offset.
181
+ SmallVector<std::pair<LoadInst *, unsigned >, 4 > ImplicitArgLoads;
182
+ for (auto *U : ImplicitArgPtr->users ()) {
183
+ Instruction *CI = dyn_cast<Instruction>(U);
184
+ if (!CI || CI->getParent ()->getParent () != &F)
185
+ continue ;
186
+
187
+ for (auto *U : CI->users ()) {
188
+ int64_t Offset = 0 ;
189
+ auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
190
+ if (!Load) {
191
+ if (GetPointerBaseWithConstantOffset (U, Offset, DL) != CI)
192
+ continue ;
193
+
194
+ Load = dyn_cast<LoadInst>(*U->user_begin ()); // Load from GEP?
195
+ }
196
+
197
+ if (!Load || !Load->isSimple ())
198
+ continue ;
199
+
200
+ // FIXME: Expand to handle 64-bit implicit args and large merged loads.
201
+ LLVMContext &Ctx = F.getParent ()->getContext ();
202
+ Type *LoadTy = Load->getType ();
203
+ HiddenArg HA = getHiddenArgFromOffset (Offset);
204
+ if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType (Ctx, HA))
205
+ continue ;
206
+
207
+ ImplicitArgLoads.push_back (std::make_pair (Load, Offset));
208
+ }
209
+ }
210
+
211
+ if (ImplicitArgLoads.empty ())
212
+ return ;
213
+
214
+ // Allocate loads in order of offset. We need to be sure that the implicit
215
+ // argument can actually be preloaded.
216
+ std::sort (ImplicitArgLoads.begin (), ImplicitArgLoads.end (), less_second ());
217
+
218
+ uint64_t LastExplicitArgOffset = ImplicitArgsBaseOffset;
219
+ // If we fail to preload any implicit argument we know we don't have SGPRs
220
+ // to preload any subsequent ones with larger offsets. Find the first
221
+ // argument that we cannot preload.
222
+ auto *PreloadEnd = std::find_if (
223
+ ImplicitArgLoads.begin (), ImplicitArgLoads.end (),
224
+ [&](const std::pair<LoadInst *, unsigned > &Load) {
225
+ unsigned LoadSize = DL.getTypeStoreSize (Load.first ->getType ());
226
+ unsigned LoadOffset = Load.second ;
227
+ if (!tryAllocPreloadSGPRs (LoadSize,
228
+ LoadOffset + ImplicitArgsBaseOffset,
229
+ LastExplicitArgOffset))
230
+ return true ;
231
+
232
+ LastExplicitArgOffset = LoadOffset + LoadSize;
233
+ return false ;
234
+ });
235
+
236
+ if (PreloadEnd == ImplicitArgLoads.begin ())
237
+ return ;
238
+
239
+ unsigned LastHiddenArgIndex = getHiddenArgFromOffset (PreloadEnd[-1 ].second );
240
+ Function *NF = cloneFunctionWithPreloadImplicitArgs (LastHiddenArgIndex);
241
+ assert (NF);
242
+ for (const auto *I = ImplicitArgLoads.begin (); I != PreloadEnd; ++I) {
243
+ LoadInst *LoadInst = I->first ;
244
+ unsigned LoadOffset = I->second ;
245
+ unsigned HiddenArgIndex = getHiddenArgFromOffset (LoadOffset);
246
+ unsigned Index = NF->arg_size () - LastHiddenArgIndex + HiddenArgIndex - 1 ;
247
+ Argument *Arg = NF->getArg (Index);
248
+ LoadInst->replaceAllUsesWith (Arg);
249
+ }
250
+ }
67
251
};
68
252
69
253
class AMDGPULowerKernelArguments : public FunctionPass {
@@ -142,6 +326,12 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
142
326
uint64_t LastExplicitArgOffset = ExplicitArgOffset;
143
327
ExplicitArgOffset = alignTo (ExplicitArgOffset, ABITypeAlign) + AllocSize;
144
328
329
+ // Guard against the situation where hidden arguments have already been
330
+ // lowered and added to the kernel function signature, i.e. in a situation
331
+ // where this pass has run twice.
332
+ if (Arg.hasAttribute (" amdgpu-hidden-argument" ))
333
+ break ;
334
+
145
335
// Try to preload this argument into user SGPRs.
146
336
if (Arg.hasInRegAttr () && InPreloadSequence && ST.hasKernargPreload () &&
147
337
!Arg.getType ()->isAggregateType ())
@@ -281,6 +471,14 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
281
471
KernArgSegment->addRetAttr (
282
472
Attribute::getWithAlignment (Ctx, std::max (KernArgBaseAlign, MaxAlign)));
283
473
474
+ if (InPreloadSequence) {
475
+ uint64_t ImplicitArgsBaseOffset =
476
+ alignTo (ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr ()) +
477
+ BaseOffset;
478
+ PreloadInfo.tryAllocImplicitArgPreloadSGPRs (ImplicitArgsBaseOffset,
479
+ Builder);
480
+ }
481
+
284
482
return true ;
285
483
}
286
484
0 commit comments