diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 0b8f2e4f96715..70dc200693342 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1639,6 +1639,10 @@ The AMDGPU backend supports the following LLVM IR attributes. function which requires AGPRs is reached through any function marked with this attribute. + "amdgpu-hidden-argument" This attribute is used internally by the backend to mark function arguments + as hidden. Hidden arguments are managed by the compiler and are not part of + the explicit arguments supplied by the user. + ======================================= ========================================================== Calling Conventions @@ -5856,6 +5860,12 @@ may insert a trap instruction at the start of the kernel prologue to manage situations where kernarg preloading is attempted on hardware with incompatible firmware. +With code object V5 and later, hidden kernel arguments that are normally +accessed through the Implicit Argument Ptr may be preloaded into User SGPRs. +These arguments are added to the kernel function signature and are marked with +the attributes "inreg" and "amdgpu-hidden-argument" (see +:ref:`amdgpu-llvm-ir-attributes-table`). + .. _amdgpu-amdhsa-kernel-prolog: Kernel Prolog diff --git a/llvm/include/llvm/IR/Argument.h b/llvm/include/llvm/IR/Argument.h index 3349f1306970e..0ffcb05519d44 100644 --- a/llvm/include/llvm/IR/Argument.h +++ b/llvm/include/llvm/IR/Argument.h @@ -178,6 +178,8 @@ class Argument final : public Value { /// Check if an argument has a given attribute. bool hasAttribute(Attribute::AttrKind Kind) const; + bool hasAttribute(StringRef Kind) const; + Attribute getAttribute(Attribute::AttrKind Kind) const; /// Method for support type inquiry through isa, cast, and dyn_cast. diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h index fec876eaafc86..43bf36d6f1eec 100644 --- a/llvm/include/llvm/IR/Function.h +++ b/llvm/include/llvm/IR/Function.h @@ -433,6 +433,9 @@ class LLVM_ABI Function : public GlobalObject, public ilist_node<Function> { /// check if an attributes is in the list of attributes. bool hasParamAttribute(unsigned ArgNo, Attribute::AttrKind Kind) const; + /// Check if an attribute is in the list of attributes. + bool hasParamAttribute(unsigned ArgNo, StringRef Kind) const; + /// gets the attribute from the list of attributes.
Attribute getAttributeAtIndex(unsigned i, Attribute::AttrKind Kind) const; diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index e4786e0bc6032..09b90713b9c79 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -351,6 +351,10 @@ bool Argument::hasAttribute(Attribute::AttrKind Kind) const { return getParent()->hasParamAttribute(getArgNo(), Kind); } +bool Argument::hasAttribute(StringRef Kind) const { + return getParent()->hasParamAttribute(getArgNo(), Kind); +} + Attribute Argument::getAttribute(Attribute::AttrKind Kind) const { return getParent()->getParamAttribute(getArgNo(), Kind); } @@ -738,6 +742,10 @@ bool Function::hasParamAttribute(unsigned ArgNo, return AttributeSets.hasParamAttr(ArgNo, Kind); } +bool Function::hasParamAttribute(unsigned ArgNo, StringRef Kind) const { + return AttributeSets.hasParamAttr(ArgNo, Kind); +} + Attribute Function::getAttributeAtIndex(unsigned i, Attribute::AttrKind Kind) const { return AttributeSets.getAttributeAtIndex(i, Kind); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index b67d78e450bb8..bd418efcb83cb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -260,8 +260,12 @@ void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF, auto &Func = MF.getFunction(); unsigned Offset = 0; auto Args = HSAMetadataDoc->getArrayNode(); - for (auto &Arg : Func.args()) + for (auto &Arg : Func.args()) { + if (Arg.hasAttribute("amdgpu-hidden-argument")) + continue; + emitKernelArg(Arg, Offset, Args); + } emitHiddenKernelArgs(MF, Offset, Args); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index 83cce6021693a..d16c96f88e7b1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -13,6 +13,8 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicsAMDGPU.h" @@ -31,9 +33,110 @@ class PreloadKernelArgInfo { const GCNSubtarget &ST; unsigned NumFreeUserSGPRs; -public: - SmallVector<llvm::Metadata *, 8> KernelArgMetadata; + enum HiddenArg : unsigned { + HIDDEN_BLOCK_COUNT_X, + HIDDEN_BLOCK_COUNT_Y, + HIDDEN_BLOCK_COUNT_Z, + HIDDEN_GROUP_SIZE_X, + HIDDEN_GROUP_SIZE_Y, + HIDDEN_GROUP_SIZE_Z, + HIDDEN_REMAINDER_X, + HIDDEN_REMAINDER_Y, + HIDDEN_REMAINDER_Z, + END_HIDDEN_ARGS + }; + + // Stores information about a specific hidden argument. + struct HiddenArgInfo { + // Offset in bytes from the location in the kernarg segment pointed to by + // the implicitarg pointer. + uint8_t Offset; + // The size of the hidden argument in bytes. + uint8_t Size; + // The name of the hidden argument in the kernel signature.
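+ // (These become the "_hidden_*" parameter names once the kernel is + // rewritten by cloneFunctionWithPreloadImplicitArgs below.)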
+ const char *Name; + }; + + static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = { + {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"}, + {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"}, + {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"}, + {18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"}, + {22, 2, "_hidden_remainder_z"}}; + + static HiddenArg getHiddenArgFromOffset(unsigned Offset) { + for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I) + if (HiddenArgs[I].Offset == Offset) + return static_cast<HiddenArg>(I); + + return END_HIDDEN_ARGS; + } + + static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) { + if (HA < END_HIDDEN_ARGS) + return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8); + + llvm_unreachable("Unexpected hidden argument."); + } + + static const char *getHiddenArgName(HiddenArg HA) { + if (HA < END_HIDDEN_ARGS) { + return HiddenArgs[HA].Name; + } + llvm_unreachable("Unexpected hidden argument."); + } + // Clones the function after adding implicit arguments to the argument list + // and returns the updated function. Preloaded implicit arguments are + // added up to and including the last one that will be preloaded, indicated by + // LastPreloadIndex. Currently preloading is only performed on sequential data + // from the start of the kernarg segment, including implicit (hidden) + // arguments. This means that all arguments up to the last preloaded argument + // will also be preloaded even if that data is unused. + Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) { + FunctionType *FT = F.getFunctionType(); + LLVMContext &Ctx = F.getParent()->getContext(); + SmallVector<Type *> FTypes(FT->param_begin(), FT->param_end()); + for (unsigned I = 0; I <= LastPreloadIndex; ++I) + FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I))); + + FunctionType *NFT = + FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg()); + Function *NF = + Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName()); + + NF->copyAttributesFrom(&F); + NF->copyMetadata(&F, 0); + NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat); + + F.getParent()->getFunctionList().insert(F.getIterator(), NF); + NF->takeName(&F); + NF->splice(NF->begin(), &F); + + Function::arg_iterator NFArg = NF->arg_begin(); + for (Argument &Arg : F.args()) { + Arg.replaceAllUsesWith(&*NFArg); + NFArg->takeName(&Arg); + ++NFArg; + } + + AttrBuilder AB(Ctx); + AB.addAttribute(Attribute::InReg); + AB.addAttribute("amdgpu-hidden-argument"); + AttributeList AL = NF->getAttributes(); + for (unsigned I = 0; I <= LastPreloadIndex; ++I) { + AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB); + NFArg++->setName(getHiddenArgName(HiddenArg(I))); + } + + NF->setAttributes(AL); + F.replaceAllUsesWith(NF); + F.setCallingConv(CallingConv::C); + + return NF; + } + +public: PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) { setInitialFreeUserSGPRsCount(); } @@ -64,6 +167,87 @@ class PreloadKernelArgInfo { NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs); return true; } + + // Try to allocate SGPRs to preload implicit kernel arguments. + void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset, + IRBuilder<> &Builder) { + StringRef Name = Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr); + Function *ImplicitArgPtr = F.getParent()->getFunction(Name); + if (!ImplicitArgPtr) + return; + + const DataLayout &DL = F.getParent()->getDataLayout(); + // Each pair is the load instruction and its byte offset from the + // implicitarg pointer.
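+ // For example, a load of the 32-bit hidden block count in X is recorded as + // (load, 0), matching the HiddenArgs table above.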
+ SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads; + for (auto *U : ImplicitArgPtr->users()) { + Instruction *CI = dyn_cast<Instruction>(U); + if (!CI || CI->getParent()->getParent() != &F) + continue; + + for (auto *U : CI->users()) { + int64_t Offset = 0; + auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr? + if (!Load) { + if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI) + continue; + + Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP? + } + + if (!Load || !Load->isSimple()) + continue; + + // FIXME: Expand to handle 64-bit implicit args and large merged loads. + LLVMContext &Ctx = F.getParent()->getContext(); + Type *LoadTy = Load->getType(); + HiddenArg HA = getHiddenArgFromOffset(Offset); + if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA)) + continue; + + ImplicitArgLoads.push_back(std::make_pair(Load, Offset)); + } + } + + if (ImplicitArgLoads.empty()) + return; + + // Allocate loads in order of offset. We need to be sure that the implicit + // argument can actually be preloaded. + std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second()); + + uint64_t LastExplicitArgOffset = ImplicitArgsBaseOffset; + // If we fail to preload any implicit argument, we know we don't have SGPRs + // to preload any subsequent ones with larger offsets. Find the first + // argument that we cannot preload. + auto *PreloadEnd = std::find_if( + ImplicitArgLoads.begin(), ImplicitArgLoads.end(), + [&](const std::pair<LoadInst *, unsigned> &Load) { + unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType()); + unsigned LoadOffset = Load.second; + if (!tryAllocPreloadSGPRs(LoadSize, + LoadOffset + ImplicitArgsBaseOffset, + LastExplicitArgOffset)) + return true; + + LastExplicitArgOffset = + ImplicitArgsBaseOffset + LoadOffset + LoadSize; + return false; + }); + + if (PreloadEnd == ImplicitArgLoads.begin()) + return; + + unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second); + Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex); + assert(NF); + for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) { + LoadInst *Load = I->first; + unsigned LoadOffset = I->second; + unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset); + unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1; + Argument *Arg = NF->getArg(Index); + Load->replaceAllUsesWith(Arg); + } + } }; class AMDGPULowerKernelArguments : public FunctionPass { @@ -142,6 +326,12 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { uint64_t LastExplicitArgOffset = ExplicitArgOffset; ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize; + // Guard against the situation where hidden arguments have already been + // lowered and added to the kernel function signature, e.g. when this pass + // has run twice. + if (Arg.hasAttribute("amdgpu-hidden-argument")) + break; + // Try to preload this argument into user SGPRs.
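// Preloading is attempted only while the sequence is unbroken: the argument // must be 'inreg', the target must support kernarg preloading, and aggregate // types are not handled (see the condition below).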
if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() && !Arg.getType()->isAggregateType()) @@ -281,6 +471,14 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { KernArgSegment->addRetAttr( Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign))); + if (InPreloadSequence) { + uint64_t ImplicitArgsBaseOffset = + alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) + + BaseOffset; + PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset, + Builder); + } + return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 9809a289df093..961a9220b48d6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -314,6 +314,9 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, MaxAlign = Align(1); for (const Argument &Arg : F.args()) { + if (Arg.hasAttribute("amdgpu-hidden-argument")) + continue; + const bool IsByRef = Arg.hasByRefAttr(); Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType(); Align Alignment = DL.getValueOrABITypeAlignment( diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index f34325e1a5afa..ccd8822570f63 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2510,24 +2510,25 @@ void SITargetLowering::allocatePreloadKernArgSGPRs( const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { Function &F = MF.getFunction(); - unsigned LastExplicitArgOffset = - MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset(); + unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset(); GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo(); bool InPreloadSequence = true; unsigned InIdx = 0; + bool AlignedForImplicitArgs = false; + unsigned ImplicitArgOffset = 0; for (auto &Arg : F.args()) { if (!InPreloadSequence || !Arg.hasInRegAttr()) break; - int ArgIdx = Arg.getArgNo(); + unsigned ArgIdx = Arg.getArgNo(); // Don't preload non-original args or parts not in the current preload // sequence. - if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() || - (int)Ins[InIdx].getOrigArgIndex() != ArgIdx)) + if (InIdx < Ins.size() && + (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx)) break; for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() && - (int)Ins[InIdx].getOrigArgIndex() == ArgIdx; + Ins[InIdx].getOrigArgIndex() == ArgIdx; InIdx++) { assert(ArgLocs[ArgIdx].isMemLoc()); auto &ArgLoc = ArgLocs[InIdx]; @@ -2537,6 +2538,18 @@ void SITargetLowering::allocatePreloadKernArgSGPRs( unsigned NumAllocSGPRs = alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32; + // Fix alignment for hidden arguments. + if (Arg.hasAttribute("amdgpu-hidden-argument")) { + if (!AlignedForImplicitArgs) { + ImplicitArgOffset = + alignTo(LastExplicitArgOffset, + Subtarget->getAlignmentForImplicitArgPtr()) - + LastExplicitArgOffset; + AlignedForImplicitArgs = true; + } + ArgOffset += ImplicitArgOffset; + } + // Arg is preloaded into the previous SGPR.
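// i.e. a sub-dword value that packs into the 32-bit SGPR already allocated // for the preceding argument (store size and alignment both below 4 bytes).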
if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) { Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back( diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll new file mode 100644 index 0000000000000..aeb7faade4715 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll @@ -0,0 +1,222 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -S < %s | FileCheck -check-prefix=NO-PRELOAD %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s | FileCheck -check-prefix=PRELOAD %s + +define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define amdgpu_kernel void @preload_block_count_x( +; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]] +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define amdgpu_kernel void @preload_block_count_x( +; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0:[0-9]+]] { +; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_X]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i32, ptr addrspace(4) %imp_arg_ptr + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i512) { +; NO-PRELOAD-LABEL: define amdgpu_kernel void @no_free_sgprs_block_count_x( +; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]], i512 [[TMP0:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(328) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) 
[[IMP_ARG_PTR]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define amdgpu_kernel void @no_free_sgprs_block_count_x( +; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i512 inreg [[TMP0:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(328) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i32, ptr addrspace(4) %imp_arg_ptr + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define amdgpu_kernel void @preloadremainder_z( +; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOADREMAINDER_Z_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32 +; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define amdgpu_kernel void @preloadremainder_z( +; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Z:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 +; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Z]] to i32 +; PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 + %load = load i16, ptr addrspace(4) %gep + %conv = zext i16 %load to i32 + store i32 %conv, ptr addrspace(1) %out + 
ret void +} + +define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define amdgpu_kernel void @preload_workgroup_size_xyz( +; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 12 +; NO-PRELOAD-NEXT: [[LOAD_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2 +; NO-PRELOAD-NEXT: [[CONV_X:%.*]] = zext i16 [[LOAD_X]] to i32 +; NO-PRELOAD-NEXT: [[GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 14 +; NO-PRELOAD-NEXT: [[LOAD_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 2 +; NO-PRELOAD-NEXT: [[CONV_Y:%.*]] = zext i16 [[LOAD_Y]] to i32 +; NO-PRELOAD-NEXT: [[GEP_Z:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16 +; NO-PRELOAD-NEXT: [[LOAD_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 2 +; NO-PRELOAD-NEXT: [[CONV_Z:%.*]] = zext i16 [[LOAD_Z]] to i32 +; NO-PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[CONV_X]], i32 0 +; NO-PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[CONV_Y]], i32 1 +; NO-PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV_Z]], i32 2 +; NO-PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT_LOAD]], align 16 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define amdgpu_kernel void @preload_workgroup_size_xyz( +; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 12 +; PRELOAD-NEXT: [[LOAD_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2 +; PRELOAD-NEXT: [[CONV_X:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_X]] to i32 +; PRELOAD-NEXT: [[GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 14 +; PRELOAD-NEXT: [[LOAD_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 2 +; PRELOAD-NEXT: [[CONV_Y:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_Y]] to i32 +; PRELOAD-NEXT: [[GEP_Z:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16 +; PRELOAD-NEXT: [[LOAD_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 2 +; PRELOAD-NEXT: [[CONV_Z:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_Z]] to i32 +; PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[CONV_X]], i32 0 +; PRELOAD-NEXT: [[INS_1:%.*]] = 
insertelement <3 x i32> [[INS_0]], i32 [[CONV_Y]], i32 1 +; PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV_Z]], i32 2 +; PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT]], align 16 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12 + %load_x = load i16, ptr addrspace(4) %gep_x + %conv_x = zext i16 %load_x to i32 + %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14 + %load_y = load i16, ptr addrspace(4) %gep_y + %conv_y = zext i16 %load_y to i32 + %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 + %load_z = load i16, ptr addrspace(4) %gep_z + %conv_z = zext i16 %load_z to i32 + %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0 + %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1 + %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2 + store <3 x i32> %ins.2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inreg %out) { +; NO-PRELOAD-LABEL: define amdgpu_kernel void @incorrect_type_i64_block_count_x( +; NO-PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8 +; NO-PRELOAD-NEXT: store i64 [[LOAD]], ptr addrspace(1) [[OUT]], align 8 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define amdgpu_kernel void @incorrect_type_i64_block_count_x( +; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8 +; PRELOAD-NEXT: store i64 [[LOAD]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i64, ptr addrspace(4) %imp_arg_ptr + store i64 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) { +; NO-PRELOAD-LABEL: define amdgpu_kernel void @random_incorrect_offset( +; NO-PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 2 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define amdgpu_kernel void @random_incorrect_offset( +; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) 
@llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 2 +; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4 +; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2 + %load = load i32, ptr addrspace(4) %gep + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @incompatible_attribute_block_count_x(ptr addrspace(1) byref(i32) %out) { +; NO-PRELOAD-LABEL: define amdgpu_kernel void @incompatible_attribute_block_count_x( +; NO-PRELOAD-SAME: ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[OUT_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1) +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[TMP1]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define amdgpu_kernel void @incompatible_attribute_block_count_x( +; PRELOAD-SAME: ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[OUT_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 +; PRELOAD-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[OUT_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1) +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[TMP1]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i32, ptr addrspace(4) %imp_arg_ptr + store i32 %load, ptr addrspace(1) %out + ret void +} + +;. +; NO-PRELOAD: [[META0]] = !{} +;. diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll new file mode 100644 index 0000000000000..b94c0cd8f4c89 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll @@ -0,0 +1,629 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90a %s + +define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0 { +; GFX940-LABEL: preload_block_count_x: +; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-NEXT: ; %bb.0: +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX90a-LABEL: preload_block_count_x: +; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-NEXT: ; %bb.0: +; GFX90a-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: s_endpgm + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i32, ptr addrspace(4) %imp_arg_ptr + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inreg %out, i32 inreg) #0 { +; GFX940-LABEL: preload_unused_arg_block_count_x: +; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-NEXT: ; %bb.0: +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX90a-LABEL: preload_unused_arg_block_count_x: +; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-NEXT: ; %bb.0: +; GFX90a-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: s_endpgm + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i32, ptr addrspace(4) %imp_arg_ptr + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %out, i256 inreg) { +; GFX940-LABEL: no_free_sgprs_block_count_x: +; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-NEXT: ; %bb.0: +; GFX940-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: global_store_dword v0, v1, s[6:7] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX90a-LABEL: no_free_sgprs_block_count_x: +; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-NEXT: ; %bb.0: +; GFX90a-NEXT: s_load_dword s0, s[6:7], 0x28 +; GFX90a-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-NEXT: global_store_dword v0, v1, s[10:11] +; GFX90a-NEXT: s_endpgm + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i32, ptr addrspace(4) %imp_arg_ptr + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @no_inreg_block_count_x(ptr addrspace(1) %out) #0 { +; GFX940-LABEL: no_inreg_block_count_x: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX90a-LABEL: no_inreg_block_count_x: +; GFX90a: ; %bb.0: +; GFX90a-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-NEXT: s_endpgm + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i32, ptr addrspace(4) %imp_arg_ptr + store i32 %load, ptr addrspace(1) %out + ret void +} + +; Implicit arg preloading is currently restricted to cases where all explicit +; args are inreg (preloaded). + +define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 inreg) #0 { +; GFX940-LABEL: mixed_inreg_block_count_x: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x10 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX90a-LABEL: mixed_inreg_block_count_x: +; GFX90a: ; %bb.0: +; GFX90a-NEXT: s_load_dword s2, s[4:5], 0x10 +; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-NEXT: s_endpgm + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i32, ptr addrspace(4) %imp_arg_ptr + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inreg %out) #0 { +; GFX940-LABEL: incorrect_type_i64_block_count_x: +; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-NEXT: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX90a-LABEL: incorrect_type_i64_block_count_x: +; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-NEXT: ; %bb.0: +; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90a-NEXT: v_mov_b32_e32 v2, 0 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90a-NEXT: s_endpgm + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i64, ptr addrspace(4) %imp_arg_ptr + store i64 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inreg %out) #0 { +; GFX940-LABEL: incorrect_type_i16_block_count_x: +; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-NEXT: ; %bb.0: +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX90a-LABEL: incorrect_type_i16_block_count_x: +; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-NEXT: ; %bb.0: +; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX90a-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-NEXT: s_endpgm + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i16, ptr addrspace(4) %imp_arg_ptr + store i16 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0 { +; GFX940-LABEL: preload_block_count_y: +; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-NEXT: ; %bb.0: +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX90a-LABEL: preload_block_count_y: +; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-NEXT: ; %bb.0: +; GFX90a-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: s_endpgm + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4 + %load = load i32, ptr addrspace(4) %gep + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) #0 { +; GFX940-LABEL: random_incorrect_offset: +; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-NEXT: ; %bb.0: +; GFX940-NEXT: s_mov_b32 s4, 8 +; GFX940-NEXT: s_load_dword s0, s[0:1], s4 offset:0x2 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX90a-LABEL: random_incorrect_offset: +; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-NEXT: ; %bb.0: +; GFX90a-NEXT: s_mov_b32 s0, 8 +; GFX90a-NEXT: s_load_dword s0, s[4:5], s0 offset:0x2 +; GFX90a-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: s_endpgm + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2 + %load = load i32, ptr addrspace(4) %gep + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0 { +; GFX940-LABEL: preload_block_count_z: +; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-NEXT: ; %bb.0: +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX90a-LABEL: preload_block_count_z: +; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-NEXT: ; %bb.0: +; GFX90a-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: s_endpgm + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8 + %load = load i32, ptr addrspace(4) %gep + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspace(1) inreg %out, i8 inreg %val) #0 { +; GFX940-LABEL: preload_block_count_x_imparg_align_ptr_i8: +; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-NEXT: ; %bb.0: +; GFX940-NEXT: s_and_b32 s0, s4, 0xff +; GFX940-NEXT: s_add_i32 s0, s6, s0 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX90a-LABEL: preload_block_count_x_imparg_align_ptr_i8: +; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-NEXT: ; %bb.0: +; GFX90a-NEXT: s_and_b32 s0, s8, 0xff +; GFX90a-NEXT: s_add_i32 s0, s10, s0 +; GFX90a-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: s_endpgm + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i32, ptr addrspace(4) %imp_arg_ptr + %ext = zext i8 %val to i32 + %add = add i32 %load, %ext + store i32 %add, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out) #0 { +; GFX940-LABEL: preload_block_count_xyz: +; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-NEXT: ; %bb.0: +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX90a-LABEL: preload_block_count_xyz: +; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-NEXT: ; %bb.0: +; GFX90a-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-NEXT: v_mov_b32_e32 v0, s8 +; GFX90a-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-NEXT: v_mov_b32_e32 v2, s10 +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NEXT: s_endpgm + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0 + %load_x = load i32, ptr addrspace(4) %gep_x + %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4 + %load_y = load i32, ptr addrspace(4) %gep_y + %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8 + %load_z = load i32, ptr addrspace(4) %gep_z + %ins.0 = insertelement <3 x i32> poison, i32 %load_x, i32 0 + %ins.1 = insertelement <3 x i32> %ins.0, i32 %load_y, i32 1 + %ins.2 = insertelement <3 x i32> %ins.1, i32 %load_z, i32 2 + store <3 x i32> %ins.2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out) #0 { +; GFX940-LABEL: preload_workgroup_size_x: +; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-NEXT: ; %bb.0: +; GFX940-NEXT: s_and_b32 s0, s7, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX90a-LABEL: preload_workgroup_size_x: +; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-NEXT: ; %bb.0: +; GFX90a-NEXT: s_and_b32 s0, s11, 0xffff +; GFX90a-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: s_endpgm + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12 + %load = load i16, ptr addrspace(4) %gep + %conv = zext i16 %load to i32 + store i32 %conv, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out) #0 { +; GFX940-LABEL: preload_workgroup_size_y: +; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-NEXT: ; %bb.0: +; GFX940-NEXT: s_lshr_b32 s0, s7, 16 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX90a-LABEL: preload_workgroup_size_y: +; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-NEXT: ; %bb.0: +; GFX90a-NEXT: s_lshr_b32 s0, s11, 16 +; GFX90a-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: s_endpgm + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14 + %load = load i16, ptr addrspace(4) %gep + %conv = zext i16 %load to i32 + store i32 %conv, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out) #0 { +; GFX940-LABEL: preload_workgroup_size_z: +; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-NEXT: ; %bb.0: +; GFX940-NEXT: s_and_b32 s0, s8, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX90a-LABEL: preload_workgroup_size_z: +; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-NEXT: ; %bb.0: +; GFX90a-NEXT: s_and_b32 s0, s12, 0xffff +; GFX90a-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: s_endpgm + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 + %load = load i16, ptr addrspace(4) %gep + %conv = zext i16 %load to i32 + store i32 %conv, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %out) #0 { +; GFX940-LABEL: preload_workgroup_size_xyz: +; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-NEXT: ; %bb.0: +; GFX940-NEXT: s_lshr_b32 s0, s7, 16 +; GFX940-NEXT: s_and_b32 s1, s7, 0xffff +; GFX940-NEXT: s_and_b32 s4, s8, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX90a-LABEL: preload_workgroup_size_xyz: +; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-NEXT: ; %bb.0: +; GFX90a-NEXT: s_lshr_b32 s0, s11, 16 +; GFX90a-NEXT: s_and_b32 s1, s11, 0xffff +; GFX90a-NEXT: s_and_b32 s2, s12, 0xffff +; GFX90a-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-NEXT: v_mov_b32_e32 v0, s1 +; GFX90a-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NEXT: s_endpgm + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12 + %load_x = load i16, ptr addrspace(4) %gep_x + %conv_x = zext i16 %load_x to i32 + %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14 + %load_y = load i16, ptr addrspace(4) %gep_y + %conv_y = zext i16 %load_y to i32 + %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 + %load_z = load i16, ptr addrspace(4) %gep_z + %conv_z = zext i16 %load_z to i32 + %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0 + %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1 + %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2 + store <3 x i32> %ins.2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 { +; GFX940-LABEL: preload_remainder_x: +; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-NEXT: ; %bb.0: +; GFX940-NEXT: s_lshr_b32 s0, s8, 16 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX90a-LABEL: preload_remainder_x: +; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-NEXT: ; %bb.0: +; GFX90a-NEXT: s_lshr_b32 s0, s12, 16 +; GFX90a-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: s_endpgm + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18 + %load = load i16, ptr addrspace(4) %gep + %conv = zext i16 %load to i32 + store i32 %conv, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 { +; GFX940-LABEL: preloadremainder_y: +; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-NEXT: ; %bb.0: +; GFX940-NEXT: s_and_b32 s0, s9, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX90a-LABEL: preloadremainder_y: +; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-NEXT: ; %bb.0: +; GFX90a-NEXT: s_and_b32 s0, s13, 0xffff +; GFX90a-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: s_endpgm + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20 + %load = load i16, ptr addrspace(4) %gep + %conv = zext i16 %load to i32 + store i32 %conv, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 { +; GFX940-LABEL: preloadremainder_z: +; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-NEXT: ; %bb.0: +; GFX940-NEXT: s_lshr_b32 s0, s9, 16 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX90a-LABEL: preloadremainder_z: +; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-NEXT: ; %bb.0: +; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 +; GFX90a-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: s_endpgm + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 + %load = load i16, ptr addrspace(4) %gep + %conv = zext i16 %load to i32 + store i32 %conv, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0 { +; GFX940-LABEL: preloadremainder_xyz: +; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-NEXT: ; %bb.0: +; GFX940-NEXT: s_lshr_b32 s0, s9, 16 +; GFX940-NEXT: s_lshr_b32 s1, s8, 16 +; GFX940-NEXT: s_and_b32 s4, s9, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: v_mov_b32_e32 v2, s0 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX90a-LABEL: preloadremainder_xyz: +; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-NEXT: ; %bb.0: +; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 +; GFX90a-NEXT: s_lshr_b32 s1, s12, 16 +; GFX90a-NEXT: s_and_b32 s2, s13, 0xffff +; GFX90a-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-NEXT: v_mov_b32_e32 v0, s1 +; GFX90a-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-NEXT: v_mov_b32_e32 v2, s0 +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NEXT: s_endpgm + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18 + %load_x = load i16, ptr addrspace(4) %gep_x + %conv_x = zext i16 %load_x to i32 + %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20 + %load_y = load i16, ptr addrspace(4) %gep_y + %conv_y = zext i16 %load_y to i32 + %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 + %load_z = load i16, ptr addrspace(4) %gep_z + %conv_z = zext i16 %load_z to i32 + %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0 + %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1 + %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2 + store <3 x i32> %ins.2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inreg %out) { +; GFX940-LABEL: no_free_sgprs_preloadremainder_z: +; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-NEXT: ; %bb.0: +; GFX940-NEXT: s_load_dword s0, s[2:3], 0x1c +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: global_store_dword v0, v1, s[6:7] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX90a-LABEL: no_free_sgprs_preloadremainder_z: +; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-NEXT: ; %bb.0: +; GFX90a-NEXT: s_load_dword s0, s[6:7], 0x1c +; GFX90a-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_lshr_b32 s0, s0, 16 +; GFX90a-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-NEXT: global_store_dword v0, v1, s[10:11] +; GFX90a-NEXT: s_endpgm + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 + %load = load i16, ptr addrspace(4) %gep + %conv = zext i16 %load to i32 + store i32 %conv, ptr addrspace(1) %out + ret void +} + +attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }