@@ -13,6 +13,8 @@

 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
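Of the two new headers, llvm/Analysis/ValueTracking.h declares GetPointerBaseWithConstantOffset, which the pass uses further down to fold constant-offset GEPs off the implicit-argument pointer into a base plus byte offset. A minimal sketch of that call's shape; the wrapper and its names are illustrative, not part of the patch:

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Value.h"

// Illustrative wrapper (assumption: not part of this patch): strip
// constant-offset GEPs from Ptr, returning the underlying base pointer and
// accumulating the byte offset walked through. When the base turns out to be
// the amdgcn.implicitarg.ptr call, Offset is the hidden-argument byte offset
// that a load through Ptr reads from.
static llvm::Value *stripConstantOffsets(llvm::Value *Ptr, int64_t &Offset,
                                         const llvm::DataLayout &DL) {
  Offset = 0;
  return llvm::GetPointerBaseWithConstantOffset(Ptr, Offset, DL);
}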
@@ -31,9 +33,97 @@ class PreloadKernelArgInfo {
   const GCNSubtarget &ST;
   unsigned NumFreeUserSGPRs;

-public:
-  SmallVector<llvm::Metadata *, 8> KernelArgMetadata;
+  enum HiddenArg : unsigned {
+    HIDDEN_BLOCK_COUNT_X,
+    HIDDEN_BLOCK_COUNT_Y,
+    HIDDEN_BLOCK_COUNT_Z,
+    HIDDEN_GROUP_SIZE_X,
+    HIDDEN_GROUP_SIZE_Y,
+    HIDDEN_GROUP_SIZE_Z,
+    HIDDEN_REMAINDER_X,
+    HIDDEN_REMAINDER_Y,
+    HIDDEN_REMAINDER_Z,
+    END_HIDDEN_ARGS
+  };
+
+  struct HiddenArgInfo {
+    uint8_t Offset;
+    uint8_t Size;
+    const char *Name;
+  };
+
+  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
+      {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
+      {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
+      {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
+      {18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"},
+      {22, 2, "_hidden_remainder_z"}};
+
+  static HiddenArg getHiddenArgIndexFromOffset(unsigned Offset) {
+    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
+      if (HiddenArgs[I].Offset == Offset)
+        return static_cast<HiddenArg>(I);
+
+    llvm_unreachable("Unexpected hidden argument offset.");
+  }
+
+  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
+    if (HA < END_HIDDEN_ARGS)
+      return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);
+
+    llvm_unreachable("Unexpected hidden argument.");
+  }
+
+  static const char *getHiddenArgName(HiddenArg HA) {
+    if (HA < END_HIDDEN_ARGS) {
+      return HiddenArgs[HA].Name;
+    }
+    llvm_unreachable("Unexpected hidden argument.");
+  }
+
+  Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
+    FunctionType *FT = F.getFunctionType();
+    LLVMContext &Ctx = F.getParent()->getContext();
+    SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
+    for (unsigned I = 0; I <= LastPreloadIndex; ++I)
+      FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));
+
+    FunctionType *NFT =
+        FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
+    Function *NF =
+        Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());
+
+    NF->copyAttributesFrom(&F);
+    NF->copyMetadata(&F, 0);
+    NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);
+
+    F.getParent()->getFunctionList().insert(F.getIterator(), NF);
+    NF->takeName(&F);
+    NF->splice(NF->begin(), &F);
+
+    Function::arg_iterator NFArg = NF->arg_begin();
+    for (Argument &Arg : F.args()) {
+      Arg.replaceAllUsesWith(&*NFArg);
+      NFArg->takeName(&Arg);
+      ++NFArg;
+    }
+
+    AttrBuilder AB(Ctx);
+    AB.addAttribute(Attribute::InReg);
+    AB.addAttribute("amdgpu-hidden-argument");
+    AttributeList AL = NF->getAttributes();
+    for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
+      AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
+      NFArg++->setName(getHiddenArgName(HiddenArg(I)));
+    }
+
+    NF->setAttributes(AL);
+    F.replaceAllUsesWith(NF);
+
+    return NF;
+  }

+public:
   PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
     setInitialFreeUserSGPRsCount();
   }
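The HiddenArgs table above pins down the start of the AMDGPU hidden-argument block: three 32-bit block counts, then 16-bit group sizes and remainders, densely packed from offset 0, which is what lets getHiddenArgIndexFromOffset map a load's byte offset straight to an argument index. A self-contained sketch of the same table and lookup; the values mirror the patch, the driver program is illustrative:

#include <cstdint>
#include <cstdio>

struct HiddenArgInfo { uint8_t Offset, Size; const char *Name; };

// Same layout as the patch: offset and size in bytes for each hidden arg.
static constexpr HiddenArgInfo HiddenArgs[] = {
    {0, 4, "_hidden_block_count_x"},  {4, 4, "_hidden_block_count_y"},
    {8, 4, "_hidden_block_count_z"},  {12, 2, "_hidden_group_size_x"},
    {14, 2, "_hidden_group_size_y"},  {16, 2, "_hidden_group_size_z"},
    {18, 2, "_hidden_remainder_x"},   {20, 2, "_hidden_remainder_y"},
    {22, 2, "_hidden_remainder_z"}};

int main() {
  // A 16-bit load at byte offset 12 resolves to _hidden_group_size_x.
  for (unsigned I = 0; I != sizeof(HiddenArgs) / sizeof(HiddenArgs[0]); ++I)
    if (HiddenArgs[I].Offset == 12)
      std::printf("index %u: %s (%u bytes)\n", I, HiddenArgs[I].Name,
                  (unsigned)HiddenArgs[I].Size);
}

cloneFunctionWithPreloadImplicitArgs then rebuilds the kernel with one extra inreg parameter per hidden argument up to the last preloaded index, tagging each with the "amdgpu-hidden-argument" attribute so later stages can tell them apart from explicit arguments.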
@@ -64,6 +154,89 @@ class PreloadKernelArgInfo {
     NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
     return true;
   }
+
+  // Try to allocate SGPRs to preload implicit kernel arguments.
+  void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
+                                       IRBuilder<> &Builder) {
+    StringRef Name = Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr);
+    Function *ImplicitArgPtr = F.getParent()->getFunction(Name);
+    if (!ImplicitArgPtr)
+      return;
+
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    // Pair is the load and the load offset.
+    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
+    for (auto *U : ImplicitArgPtr->users()) {
+      Instruction *CI = dyn_cast<Instruction>(U);
+      if (!CI || CI->getParent()->getParent() != &F)
+        continue;
+
+      for (auto *U : CI->users()) {
+        int64_t Offset = 0;
+        auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
+        if (!Load) {
+          if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
+            continue;
+
+          Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
+        }
+
+        if (!Load || !Load->isSimple())
+          continue;
+
+        // FIXME: Expand to handle 64-bit implicit args and large merged loads.
+        unsigned LoadSize = Load->getType()->getScalarSizeInBits();
+        if (LoadSize != 32 && LoadSize != 16)
+          continue;
+
+        ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
+      }
+    }
+
+    if (ImplicitArgLoads.empty())
+      return;
+
+    // Allocate loads in order of offset. We need to be sure that the implicit
+    // argument can actually be preloaded.
+    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
+              [](const std::pair<LoadInst *, unsigned> &A,
+                 const std::pair<LoadInst *, unsigned> &B) {
+                return A.second < B.second;
+              });
+
+    uint64_t LastExplicitArgOffset = ImplicitArgsBaseOffset;
+    // If we fail to preload any implicit argument we know we don't have SGPRs
+    // to preload any subsequent ones with larger offsets. Find the first
+    // argument that we cannot preload.
+    auto *PreloadEnd = std::find_if(
+        ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
+        [&](const std::pair<LoadInst *, unsigned> &Load) {
+          unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
+          unsigned LoadOffset = Load.second;
+          if (!tryAllocPreloadSGPRs(LoadSize,
+                                    LoadOffset + ImplicitArgsBaseOffset,
+                                    LastExplicitArgOffset))
+            return true;
+
+          LastExplicitArgOffset = LoadOffset + LoadSize;
+          return false;
+        });
+
+    if (PreloadEnd == ImplicitArgLoads.begin())
+      return;
+
+    unsigned LastHiddenArgIndex = getHiddenArgIndexFromOffset(PreloadEnd[-1].second);
+    Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
+    assert(NF);
+    for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
+      LoadInst *LoadInst = I->first;
+      unsigned LoadOffset = I->second;
+      unsigned HiddenArgIndex = getHiddenArgIndexFromOffset(LoadOffset);
+      unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
+      Argument *Arg = NF->getArg(Index);
+      LoadInst->replaceAllUsesWith(Arg);
+    }
+  }
 };

 class AMDGPULowerKernelArguments : public FunctionPass {
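tryAllocImplicitArgPreloadSGPRs collects qualifying 16- and 32-bit loads from the implicit-argument pointer, sorts them by offset, and uses std::find_if to keep only the longest prefix for which user SGPRs can still be allocated: once one load fails, every later (higher-offset) load must fail too. A standalone sketch of that prefix selection, with a deliberately simplified SGPR budget standing in for tryAllocPreloadSGPRs:

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // (size in bytes, offset) pairs standing in for the collected loads.
  std::vector<std::pair<unsigned, unsigned>> Loads = {
      {4, 8}, {4, 0}, {2, 12}, {4, 4}};
  std::sort(Loads.begin(), Loads.end(),
            [](auto &A, auto &B) { return A.second < B.second; });

  // Assumption: a toy budget of one SGPR per 4 bytes, rounded up, from a
  // fixed pool; the real pass also accounts for padding and base offsets.
  unsigned FreeSGPRs = 3;
  auto PreloadEnd = std::find_if(Loads.begin(), Loads.end(), [&](auto &L) {
    unsigned Needed = (L.first + 3) / 4; // round bytes up to whole SGPRs
    if (Needed > FreeSGPRs)
      return true; // first load we cannot preload; stop here
    FreeSGPRs -= Needed;
    return false;
  });

  std::printf("preloading %zu of %zu loads\n",
              static_cast<size_t>(PreloadEnd - Loads.begin()), Loads.size());
}

Everything before PreloadEnd is then rewired: the kernel is cloned with the extra inreg parameters, and each preloaded load is replaced wholesale by the matching new argument.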
@@ -281,6 +454,14 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
   KernArgSegment->addRetAttr(
       Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));

+  if (InPreloadSequence) {
+    uint64_t ImplicitArgsBaseOffset =
+        alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
+        BaseOffset;
+    PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
+                                                Builder);
+  }
+
   return true;
 }
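The final hunk hooks the new logic into lowerKernelArguments: only while the explicit-argument preload sequence is still intact (InPreloadSequence) does the pass attempt implicit arguments, and the hidden block is taken to begin at the explicit-argument size rounded up to the implicit-argument pointer alignment. A worked sketch of that offset arithmetic, with made-up numbers in place of the subtarget query:

#include <cstdint>
#include <cstdio>

// Same rounding llvm::alignTo performs for power-of-two alignments.
static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  uint64_t ExplicitArgOffset = 20; // bytes of explicit kernel arguments
  uint64_t ImplicitAlign = 8;      // e.g. ST.getAlignmentForImplicitArgPtr()
  uint64_t BaseOffset = 0;         // kernarg segment base offset
  // 20 rounded up to 8 is 24, so hidden args start at byte 24 here.
  std::printf("implicit args begin at byte %llu\n",
              (unsigned long long)(alignTo(ExplicitArgOffset, ImplicitAlign) +
                                   BaseOffset));
}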