diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index b572f81defe2d..5a917734e9c74 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -65,6 +65,7 @@ ModulePass *createAMDGPULowerBufferFatPointersPass();
 FunctionPass *createSIModeRegisterPass();
 FunctionPass *createGCNPreRAOptimizationsLegacyPass();
 FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass();
+ModulePass *createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *);
 
 struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
   AMDGPUSimplifyLibCallsPass() {}
@@ -233,6 +234,9 @@ extern char &GCNRegPressurePrinterID;
 void initializeAMDGPUPreloadKernArgPrologLegacyPass(PassRegistry &);
 extern char &AMDGPUPreloadKernArgPrologLegacyID;
 
+void initializeAMDGPUPreloadKernelArgumentsLegacyPass(PassRegistry &);
+extern char &AMDGPUPreloadKernelArgumentsLegacyID;
+
 // Passes common to R600 and SI
 FunctionPass *createAMDGPUPromoteAlloca();
 void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
@@ -347,6 +351,16 @@ class AMDGPUAttributorPass : public PassInfoMixin<AMDGPUAttributorPass> {
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
 };
 
+class AMDGPUPreloadKernelArgumentsPass
+    : public PassInfoMixin<AMDGPUPreloadKernelArgumentsPass> {
+  const TargetMachine &TM;
+
+public:
+  explicit AMDGPUPreloadKernelArgumentsPass(const TargetMachine &TM) : TM(TM) {}
+
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
 class AMDGPUAnnotateUniformValuesPass
     : public PassInfoMixin<AMDGPUAnnotateUniformValuesPass> {
 public:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 78e75f888c99c..433144a60d120 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -25,10 +25,6 @@
 
 using namespace llvm;
 
-static cl::opt<unsigned> KernargPreloadCount(
-    "amdgpu-kernarg-preload-count",
-    cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));
-
 static cl::opt<unsigned> IndirectCallSpecializationThreshold(
     "amdgpu-indirect-call-specialization-threshold",
     cl::desc(
@@ -1327,21 +1323,6 @@ struct AAAMDGPUNoAGPR
 
 const char AAAMDGPUNoAGPR::ID = 0;
 
-static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
-  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
-  for (unsigned I = 0;
-       I < F.arg_size() &&
-       I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs());
-       ++I) {
-    Argument &Arg = *F.getArg(I);
-    // Check for incompatible attributes.
-    if (Arg.hasByRefAttr() || Arg.hasNestAttr())
-      break;
-
-    Arg.addAttr(Attribute::InReg);
-  }
-}
-
 static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
                     AMDGPUAttributorOptions Options,
                     ThinOrFullLTOPhase LTOPhase) {
@@ -1396,8 +1377,6 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
     if (!AMDGPU::isEntryFunctionCC(CC)) {
       A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
       A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(*F));
-    } else if (CC == CallingConv::AMDGPU_KERNEL) {
-      addPreloadKernArgHint(*F, TM);
     }
 
     for (auto &I : instructions(F)) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index a4e6768b4630d..dec781d71c54e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -27,231 +27,6 @@ using namespace llvm;
 
 namespace {
 
-class PreloadKernelArgInfo {
-private:
-  Function &F;
-  const GCNSubtarget &ST;
-  unsigned NumFreeUserSGPRs;
-
-  enum HiddenArg : unsigned {
-    HIDDEN_BLOCK_COUNT_X,
-    HIDDEN_BLOCK_COUNT_Y,
-    HIDDEN_BLOCK_COUNT_Z,
-    HIDDEN_GROUP_SIZE_X,
-    HIDDEN_GROUP_SIZE_Y,
-    HIDDEN_GROUP_SIZE_Z,
-    HIDDEN_REMAINDER_X,
-    HIDDEN_REMAINDER_Y,
-    HIDDEN_REMAINDER_Z,
-    END_HIDDEN_ARGS
-  };
-
-  // Stores information about a specific hidden argument.
-  struct HiddenArgInfo {
-    // Offset in bytes from the location in the kernearg segment pointed to by
-    // the implicitarg pointer.
-    uint8_t Offset;
-    // The size of the hidden argument in bytes.
-    uint8_t Size;
-    // The name of the hidden argument in the kernel signature.
-    const char *Name;
-  };
-
-  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
-      {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
-      {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
-      {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
-      {18, 2, "_hidden_remainder_x"},  {20, 2, "_hidden_remainder_y"},
-      {22, 2, "_hidden_remainder_z"}};
-
-  static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
-    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
-      if (HiddenArgs[I].Offset == Offset)
-        return static_cast<HiddenArg>(I);
-
-    return END_HIDDEN_ARGS;
-  }
-
-  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
-    if (HA < END_HIDDEN_ARGS)
-      return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);
-
-    llvm_unreachable("Unexpected hidden argument.");
-  }
-
-  static const char *getHiddenArgName(HiddenArg HA) {
-    if (HA < END_HIDDEN_ARGS) {
-      return HiddenArgs[HA].Name;
-    }
-    llvm_unreachable("Unexpected hidden argument.");
-  }
-
-  // Clones the function after adding implicit arguments to the argument list
-  // and returns the new updated function. Preloaded implicit arguments are
-  // added up to and including the last one that will be preloaded, indicated by
-  // LastPreloadIndex. Currently preloading is only performed on the totality of
-  // sequential data from the kernarg segment including implicit (hidden)
-  // arguments. This means that all arguments up to the last preloaded argument
-  // will also be preloaded even if that data is unused.
-  Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
-    FunctionType *FT = F.getFunctionType();
-    LLVMContext &Ctx = F.getParent()->getContext();
-    SmallVector<Type *, 8> FTypes(FT->param_begin(), FT->param_end());
-    for (unsigned I = 0; I <= LastPreloadIndex; ++I)
-      FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));
-
-    FunctionType *NFT =
-        FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
-    Function *NF =
-        Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());
-
-    NF->copyAttributesFrom(&F);
-    NF->copyMetadata(&F, 0);
-    NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);
-
-    F.getParent()->getFunctionList().insert(F.getIterator(), NF);
-    NF->takeName(&F);
-    NF->splice(NF->begin(), &F);
-
-    Function::arg_iterator NFArg = NF->arg_begin();
-    for (Argument &Arg : F.args()) {
-      Arg.replaceAllUsesWith(&*NFArg);
-      NFArg->takeName(&Arg);
-      ++NFArg;
-    }
-
-    AttrBuilder AB(Ctx);
-    AB.addAttribute(Attribute::InReg);
-    AB.addAttribute("amdgpu-hidden-argument");
-    AttributeList AL = NF->getAttributes();
-    for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
-      AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
-      NFArg++->setName(getHiddenArgName(HiddenArg(I)));
-    }
-
-    NF->setAttributes(AL);
-    F.replaceAllUsesWith(NF);
-    F.setCallingConv(CallingConv::C);
-    F.clearMetadata();
-
-    return NF;
-  }
-
-public:
-  PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
-    setInitialFreeUserSGPRsCount();
-  }
-
-  // Returns the maximum number of user SGPRs that we have available to preload
-  // arguments.
-  void setInitialFreeUserSGPRsCount() {
-    GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
-    NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
-  }
-
-  bool tryAllocPreloadSGPRs(unsigned AllocSize, uint64_t ArgOffset,
-                            uint64_t LastExplicitArgOffset) {
-    // Check if this argument may be loaded into the same register as the
-    // previous argument.
-    if (ArgOffset - LastExplicitArgOffset < 4 &&
-        !isAligned(Align(4), ArgOffset))
-      return true;
-
-    // Pad SGPRs for kernarg alignment.
-    ArgOffset = alignDown(ArgOffset, 4);
-    unsigned Padding = ArgOffset - LastExplicitArgOffset;
-    unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
-    unsigned NumPreloadSGPRs = alignTo(AllocSize, 4) / 4;
-    if (NumPreloadSGPRs + PaddingSGPRs > NumFreeUserSGPRs)
-      return false;
-
-    NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
-    return true;
-  }
-
-  // Try to allocate SGPRs to preload implicit kernel arguments.
-  void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
-                                       uint64_t LastExplicitArgOffset,
-                                       IRBuilder<> &Builder) {
-    Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
-        F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
-    if (!ImplicitArgPtr)
-      return;
-
-    const DataLayout &DL = F.getParent()->getDataLayout();
-    // Pair is the load and the load offset.
-    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
-    for (auto *U : ImplicitArgPtr->users()) {
-      Instruction *CI = dyn_cast<Instruction>(U);
-      if (!CI || CI->getParent()->getParent() != &F)
-        continue;
-
-      for (auto *U : CI->users()) {
-        int64_t Offset = 0;
-        auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
-        if (!Load) {
-          if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
-            continue;
-
-          Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
-        }
-
-        if (!Load || !Load->isSimple())
-          continue;
-
-        // FIXME: Expand to handle 64-bit implicit args and large merged loads.
-        LLVMContext &Ctx = F.getParent()->getContext();
-        Type *LoadTy = Load->getType();
-        HiddenArg HA = getHiddenArgFromOffset(Offset);
-        if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
-          continue;
-
-        ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
-      }
-    }
-
-    if (ImplicitArgLoads.empty())
-      return;
-
-    // Allocate loads in order of offset. We need to be sure that the implicit
-    // argument can actually be preloaded.
-    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());
-
-    // If we fail to preload any implicit argument we know we don't have SGPRs
-    // to preload any subsequent ones with larger offsets. Find the first
-    // argument that we cannot preload.
-    auto *PreloadEnd = std::find_if(
-        ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
-        [&](const std::pair<LoadInst *, unsigned> &Load) {
-          unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
-          unsigned LoadOffset = Load.second;
-          if (!tryAllocPreloadSGPRs(LoadSize,
-                                    LoadOffset + ImplicitArgsBaseOffset,
-                                    LastExplicitArgOffset))
-            return true;
-
-          LastExplicitArgOffset =
-              ImplicitArgsBaseOffset + LoadOffset + LoadSize;
-          return false;
-        });
-
-    if (PreloadEnd == ImplicitArgLoads.begin())
-      return;
-
-    unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
-    Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
-    assert(NF);
-    for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
-      LoadInst *LoadInst = I->first;
-      unsigned LoadOffset = I->second;
-      unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
-      unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
-      Argument *Arg = NF->getArg(Index);
-      LoadInst->replaceAllUsesWith(Arg);
-    }
-  }
-};
-
 class AMDGPULowerKernelArguments : public FunctionPass {
 public:
   static char ID;
@@ -311,10 +86,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
       Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
 
   uint64_t ExplicitArgOffset = 0;
-  // Preloaded kernel arguments must be sequential.
-  bool InPreloadSequence = true;
-  PreloadKernelArgInfo PreloadInfo(F, ST);
-
   for (Argument &Arg : F.args()) {
     const bool IsByRef = Arg.hasByRefAttr();
     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
@@ -325,25 +96,10 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
     uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
-    uint64_t LastExplicitArgOffset = ExplicitArgOffset;
     ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
 
-    // Guard against the situation where hidden arguments have already been
-    // lowered and added to the kernel function signiture, i.e. in a situation
-    // where this pass has run twice.
-    if (Arg.hasAttribute("amdgpu-hidden-argument"))
-      break;
-
-    // Try to preload this argument into user SGPRs.
-    if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() &&
-        !Arg.getType()->isAggregateType())
-      if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset,
-                                           LastExplicitArgOffset))
-        continue;
-
-    InPreloadSequence = false;
-
-    if (Arg.use_empty())
+    // Skip inreg arguments which should be preloaded.
+    if (Arg.use_empty() || Arg.hasInRegAttr())
      continue;
 
    // If this is byval, the loads are already explicit in the function. We just
@@ -483,14 +239,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
   KernArgSegment->addRetAttr(
       Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
 
-  if (InPreloadSequence) {
-    uint64_t ImplicitArgsBaseOffset =
-        alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
-        BaseOffset;
-    PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
-                                                ExplicitArgOffset, Builder);
-  }
-
   return true;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 98a1147ef6d66..13453963eec6d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -25,6 +25,7 @@ MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
 MODULE_PASS("amdgpu-perf-hint",
             AMDGPUPerfHintAnalysisPass(
                 *static_cast<const GCNTargetMachine *>(this)))
+MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(*this))
 MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
 MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
new file mode 100644
index 0000000000000..c1626b4fac869
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
@@ -0,0 +1,358 @@
+//===- AMDGPUPreloadKernelArguments.cpp - Preload Kernel Arguments --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass preloads kernel arguments into user_data SGPRs before kernel
+/// execution begins. The number of registers available for preloading depends
+/// on the number of free user SGPRs, up to the hardware's maximum limit.
+/// Implicit arguments enabled in the kernel descriptor are allocated first,
+/// followed by SGPRs used for preloaded kernel arguments. (Reference:
+/// https://llvm.org/docs/AMDGPUUsage.html#initial-kernel-execution-state)
+/// Additionally, hidden kernel arguments may be preloaded, in which case they
+/// are appended to the kernel signature after explicit arguments. Preloaded
+/// arguments will be marked with `inreg`.
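+///
+/// A minimal illustration (hypothetical kernel, not taken from this patch):
+/// with enough free user SGPRs and, e.g., -amdgpu-kernarg-preload-count=2, a
+/// signature such as
+///
+///   define amdgpu_kernel void @k(i32 %x, i32 %y)
+///
+/// is annotated by this pass as
+///
+///   define amdgpu_kernel void @k(i32 inreg %x, i32 inreg %y)
+///
+/// so that later lowering keeps %x and %y in user SGPRs instead of emitting
+/// loads from the kernarg segment.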
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Pass.h"
+
+#define DEBUG_TYPE "amdgpu-preload-kernel-arguments"
+
+using namespace llvm;
+
+static cl::opt<unsigned> KernargPreloadCount(
+    "amdgpu-kernarg-preload-count",
+    cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));
+
+namespace {
+
+class AMDGPUPreloadKernelArgumentsLegacy : public ModulePass {
+  const GCNTargetMachine *TM;
+
+public:
+  static char ID;
+  explicit AMDGPUPreloadKernelArgumentsLegacy(
+      const GCNTargetMachine *TM = nullptr);
+
+  StringRef getPassName() const override {
+    return "AMDGPU Preload Kernel Arguments";
+  }
+
+  bool runOnModule(Module &M) override;
+};
+
+class PreloadKernelArgInfo {
+private:
+  Function &F;
+  const GCNSubtarget &ST;
+  unsigned NumFreeUserSGPRs;
+
+  enum HiddenArg : unsigned {
+    HIDDEN_BLOCK_COUNT_X,
+    HIDDEN_BLOCK_COUNT_Y,
+    HIDDEN_BLOCK_COUNT_Z,
+    HIDDEN_GROUP_SIZE_X,
+    HIDDEN_GROUP_SIZE_Y,
+    HIDDEN_GROUP_SIZE_Z,
+    HIDDEN_REMAINDER_X,
+    HIDDEN_REMAINDER_Y,
+    HIDDEN_REMAINDER_Z,
+    END_HIDDEN_ARGS
+  };
+
+  // Stores information about a specific hidden argument.
+  struct HiddenArgInfo {
+    // Offset in bytes from the location in the kernarg segment pointed to by
+    // the implicitarg pointer.
+    uint8_t Offset;
+    // The size of the hidden argument in bytes.
+    uint8_t Size;
+    // The name of the hidden argument in the kernel signature.
+    const char *Name;
+  };
+
+  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
+      {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
+      {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
+      {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
+      {18, 2, "_hidden_remainder_x"},  {20, 2, "_hidden_remainder_y"},
+      {22, 2, "_hidden_remainder_z"}};
+
+  static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
+    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
+      if (HiddenArgs[I].Offset == Offset)
+        return static_cast<HiddenArg>(I);
+
+    return END_HIDDEN_ARGS;
+  }
+
+  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
+    if (HA < END_HIDDEN_ARGS)
+      return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);
+
+    llvm_unreachable("Unexpected hidden argument.");
+  }
+
+  static const char *getHiddenArgName(HiddenArg HA) {
+    if (HA < END_HIDDEN_ARGS)
+      return HiddenArgs[HA].Name;
+
+    llvm_unreachable("Unexpected hidden argument.");
+  }
+
+  // Clones the function after adding implicit arguments to the argument list
+  // and returns the new updated function. Preloaded implicit arguments are
+  // added up to and including the last one that will be preloaded, indicated by
+  // LastPreloadIndex. Currently preloading is only performed on the totality of
+  // sequential data from the kernarg segment including implicit (hidden)
+  // arguments. This means that all arguments up to the last preloaded argument
+  // will also be preloaded even if that data is unused.
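+  //
+  // Hypothetical example: if the only hidden-argument access that can be
+  // preloaded is a load of _hidden_block_count_z (offset 8), LastPreloadIndex
+  // selects HIDDEN_BLOCK_COUNT_Z, and all three block counts (x, y and z) are
+  // appended to the cloned signature even though x and y are unused.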
+  Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
+    FunctionType *FT = F.getFunctionType();
+    LLVMContext &Ctx = F.getParent()->getContext();
+    SmallVector<Type *, 8> FTypes(FT->param_begin(), FT->param_end());
+    for (unsigned I = 0; I <= LastPreloadIndex; ++I)
+      FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));
+
+    FunctionType *NFT =
+        FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
+    Function *NF =
+        Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());
+
+    NF->copyAttributesFrom(&F);
+    NF->copyMetadata(&F, 0);
+    NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);
+
+    F.getParent()->getFunctionList().insert(F.getIterator(), NF);
+    NF->takeName(&F);
+    NF->splice(NF->begin(), &F);
+
+    Function::arg_iterator NFArg = NF->arg_begin();
+    for (Argument &Arg : F.args()) {
+      Arg.replaceAllUsesWith(&*NFArg);
+      NFArg->takeName(&Arg);
+      ++NFArg;
+    }
+
+    AttrBuilder AB(Ctx);
+    AB.addAttribute(Attribute::InReg);
+    AB.addAttribute("amdgpu-hidden-argument");
+    AttributeList AL = NF->getAttributes();
+    for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
+      AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
+      NFArg++->setName(getHiddenArgName(HiddenArg(I)));
+    }
+
+    NF->setAttributes(AL);
+    F.replaceAllUsesWith(NF);
+
+    return NF;
+  }
+
+public:
+  PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
+    setInitialFreeUserSGPRsCount();
+  }
+
+  // Returns the maximum number of user SGPRs that we have available to preload
+  // arguments.
+  void setInitialFreeUserSGPRsCount() {
+    GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
+    NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
+  }
+
+  bool canPreloadKernArgAtOffset(uint64_t ExplicitArgOffset) {
+    return ExplicitArgOffset <= NumFreeUserSGPRs * 4;
+  }
+
+  // Try to allocate SGPRs to preload hidden kernel arguments.
+  void
+  tryAllocHiddenArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
+                                SmallVectorImpl<Function *> &FunctionsToErase) {
+    Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
+        F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
+    if (!ImplicitArgPtr)
+      return;
+
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    // Pair is the load and the load offset.
+    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
+    for (auto *U : ImplicitArgPtr->users()) {
+      Instruction *CI = dyn_cast<Instruction>(U);
+      if (!CI || CI->getParent()->getParent() != &F)
+        continue;
+
+      for (auto *U : CI->users()) {
+        int64_t Offset = 0;
+        auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
+        if (!Load) {
+          if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
+            continue;
+
+          Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
+        }
+
+        if (!Load || !Load->isSimple())
+          continue;
+
+        // FIXME: Expand to handle merged loads.
+        LLVMContext &Ctx = F.getParent()->getContext();
+        Type *LoadTy = Load->getType();
+        HiddenArg HA = getHiddenArgFromOffset(Offset);
+        if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
+          continue;
+
+        ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
+      }
+    }
+
+    if (ImplicitArgLoads.empty())
+      return;
+
+    // Allocate loads in order of offset. We need to be sure that the implicit
+    // argument can actually be preloaded.
+    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());
+
+    // If we fail to preload any implicit argument we know we don't have SGPRs
+    // to preload any subsequent ones with larger offsets. Find the first
+    // argument that we cannot preload.
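+    //
+    // Worked example with hypothetical numbers: given 14 free user SGPRs,
+    // canPreloadKernArgAtOffset() accepts loads that end no more than
+    // 14 * 4 = 56 bytes from the kernarg base, so a hidden-argument load
+    // ending at byte 56 is still preloaded while one ending at byte 60 marks
+    // the cut-off point.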
+    auto *PreloadEnd =
+        std::find_if(ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
+                     [&](const std::pair<LoadInst *, unsigned> &Load) {
+                       unsigned LoadSize =
+                           DL.getTypeStoreSize(Load.first->getType());
+                       unsigned LoadOffset = Load.second;
+                       if (!canPreloadKernArgAtOffset(LoadOffset + LoadSize +
+                                                      ImplicitArgsBaseOffset))
+                         return true;
+
+                       return false;
+                     });
+
+    if (PreloadEnd == ImplicitArgLoads.begin())
+      return;
+
+    unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
+    Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
+    assert(NF);
+    FunctionsToErase.push_back(&F);
+    for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
+      LoadInst *LoadInst = I->first;
+      unsigned LoadOffset = I->second;
+      unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
+      unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
+      Argument *Arg = NF->getArg(Index);
+      LoadInst->replaceAllUsesWith(Arg);
+    }
+  }
+};
+
+} // end anonymous namespace
+
+char AMDGPUPreloadKernelArgumentsLegacy::ID = 0;
+
+INITIALIZE_PASS(AMDGPUPreloadKernelArgumentsLegacy, DEBUG_TYPE,
+                "AMDGPU Preload Kernel Arguments", false, false)
+
+ModulePass *
+llvm::createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *TM) {
+  return new AMDGPUPreloadKernelArgumentsLegacy(
+      static_cast<const GCNTargetMachine *>(TM));
+}
+
+AMDGPUPreloadKernelArgumentsLegacy::AMDGPUPreloadKernelArgumentsLegacy(
+    const GCNTargetMachine *TM)
+    : ModulePass(ID), TM(TM) {}
+
+static bool markKernelArgsAsInreg(Module &M, const TargetMachine &TM) {
+  SmallVector<Function *> FunctionsToErase;
+  bool Changed = false;
+  for (auto &F : M) {
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+    if (!ST.hasKernargPreload() ||
+        F.getCallingConv() != CallingConv::AMDGPU_KERNEL)
+      continue;
+
+    PreloadKernelArgInfo PreloadInfo(F, ST);
+    uint64_t ExplicitArgOffset = 0;
+    const DataLayout &DL = F.getDataLayout();
+    const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();
+    unsigned NumPreloadsRequested = KernargPreloadCount;
+    unsigned NumPreloadedExplicitArgs = 0;
+    for (Argument &Arg : F.args()) {
+      // Avoid incompatible attributes and guard against running this pass
+      // twice.
+      //
+      // TODO: Preload byref kernel arguments
+      if (Arg.hasByRefAttr() || Arg.hasNestAttr() ||
+          Arg.hasAttribute("amdgpu-hidden-argument"))
+        break;
+
+      // Inreg may be pre-existing on some arguments; try to preload these.
+      if (NumPreloadsRequested == 0 && !Arg.hasInRegAttr())
+        break;
+
+      // FIXME: Preload aggregates.
+      if (Arg.getType()->isAggregateType())
+        break;
+
+      Type *ArgTy = Arg.getType();
+      Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
+      uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
+      ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
+
+      if (!PreloadInfo.canPreloadKernArgAtOffset(ExplicitArgOffset))
+        break;
+
+      Arg.addAttr(Attribute::InReg);
+      NumPreloadedExplicitArgs++;
+      if (NumPreloadsRequested > 0)
+        NumPreloadsRequested--;
+    }
+
+    // Only try preloading hidden arguments if we can successfully preload the
+    // last explicit argument.
+    if (NumPreloadedExplicitArgs == F.arg_size()) {
+      uint64_t ImplicitArgsBaseOffset =
+          alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
+          BaseOffset;
+      PreloadInfo.tryAllocHiddenArgPreloadSGPRs(ImplicitArgsBaseOffset,
+                                                FunctionsToErase);
+    }
+
+    Changed |= NumPreloadedExplicitArgs > 0;
+  }
+
+  // Erase cloned functions if we needed to update the kernel signature to
+  // support preloading hidden kernel arguments.
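+  //
+  // Erasure is deferred to this point (rather than done inside the loop
+  // above) so that the walk over the module's function list is not
+  // invalidated mid-iteration.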
+  for (auto *F : FunctionsToErase)
+    F->eraseFromParent();
+
+  return Changed;
+}
+
+bool AMDGPUPreloadKernelArgumentsLegacy::runOnModule(Module &M) {
+  if (skipModule(M) || !TM)
+    return false;
+
+  return markKernelArgsAsInreg(M, *TM);
+}
+
+PreservedAnalyses
+AMDGPUPreloadKernelArgumentsPass::run(Module &M, ModuleAnalysisManager &AM) {
+  bool Changed = markKernelArgsAsInreg(M, TM);
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c22b27abdbf6c..ccb251b730f16 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -566,6 +566,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeGCNRegPressurePrinterPass(*PR);
   initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
   initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR);
+  initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -1321,6 +1322,10 @@ void AMDGPUPassConfig::addIRPasses() {
 }
 
 void AMDGPUPassConfig::addCodeGenPrepare() {
+  if (TM->getTargetTriple().isAMDGCN() &&
+      TM->getOptLevel() > CodeGenOptLevel::None)
+    addPass(createAMDGPUPreloadKernelArgumentsLegacyPass(TM));
+
   if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments)
     addPass(createAMDGPULowerKernelArgumentsPass());
 
@@ -2050,6 +2055,9 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
   // AMDGPUAnnotateKernelFeaturesPass is missing here, but it will hopefully be
   // deleted soon.
 
+  if (TM.getOptLevel() > CodeGenOptLevel::None)
+    addPass(AMDGPUPreloadKernelArgumentsPass(TM));
+
   if (EnableLowerKernelArguments)
     addPass(AMDGPULowerKernelArgumentsPass(TM));
 
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 09a3096602fc3..c6d70ee39202e 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -89,6 +89,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUPostLegalizerCombiner.cpp
   AMDGPUPreLegalizerCombiner.cpp
   AMDGPUPreloadKernArgProlog.cpp
+  AMDGPUPreloadKernelArguments.cpp
   AMDGPUPrintfRuntimeBinding.cpp
   AMDGPUPromoteAlloca.cpp
   AMDGPUPromoteKernelArguments.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index b3865eefb4f54..5733cf9a44d32 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -625,7 +625,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
   ret <4 x float> %r
 }
 
-define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) {
+define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> %tdescr) {
 ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign:
 ; GFX1030:       ; %bb.0:
 ;
GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 @@ -845,7 +845,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ret void } -define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { +define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_clause 0x1 @@ -956,7 +956,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ret void } -define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { +define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd-with-ret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd-with-ret.ll index b46a82759f6c5..dadd971396a35 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd-with-ret.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd-with-ret.ll @@ -8,7 +8,7 @@ declare <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half>, ptr ; GFX90A-LABEL: {{^}}buffer_atomic_add_f32_rtn: ; GFX90A: buffer_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}], s{{[0-9]+}} offen glc -define amdgpu_kernel void @buffer_atomic_add_f32_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 %soffset) { +define amdgpu_kernel void @buffer_atomic_add_f32_rtn(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset) { main_body: %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) store float %ret, ptr poison @@ -17,7 +17,7 @@ main_body: ; GFX90A-LABEL: {{^}}buffer_atomic_add_v2f16_rtn: ; GFX90A: buffer_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}], s{{[0-9]+}} offen glc -define amdgpu_kernel void @buffer_atomic_add_v2f16_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +define amdgpu_kernel void @buffer_atomic_add_v2f16_rtn(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset) { main_body: %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) store <2 x half> %ret, ptr poison diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll index 96603c10787d6..de5398242a1a5 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll @@ -18,7 +18,7 @@ target triple = "amdgcn--" declare void @llvm.memcpy.p7.p7.i32(ptr addrspace(7), ptr addrspace(7), i32, i1) -define amdgpu_kernel void @memcpy_known(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { +define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) %dst) { ; SDAG-LABEL: memcpy_known: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -235,14 +235,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) inreg %src, ptr addrspa ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] ; SDAG-GFX942-LABEL: memcpy_known: -; SDAG-GFX942: ; %bb.3: 
-; SDAG-GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 -; SDAG-GFX942-NEXT: s_load_dword s12, s[4:5], 0x10 -; SDAG-GFX942-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX942-NEXT: s_branch .LBB0_0 -; SDAG-GFX942-NEXT: .p2align 8 -; SDAG-GFX942-NEXT: ; %bb.4: -; SDAG-GFX942-NEXT: .LBB0_0: +; SDAG-GFX942: ; %bb.0: ; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX942-NEXT: s_load_dword s17, s[4:5], 0x34 ; SDAG-GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x44 @@ -594,7 +587,7 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) inreg %src, ptr addrspa ret void } -define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { +define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrspace(7) %dst) { ; SDAG-LABEL: memcpy_known_medium: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -802,14 +795,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) inreg %src, ptr ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] ; SDAG-GFX942-LABEL: memcpy_known_medium: -; SDAG-GFX942: ; %bb.3: -; SDAG-GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 -; SDAG-GFX942-NEXT: s_load_dword s12, s[4:5], 0x10 -; SDAG-GFX942-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX942-NEXT: s_branch .LBB1_0 -; SDAG-GFX942-NEXT: .p2align 8 -; SDAG-GFX942-NEXT: ; %bb.4: -; SDAG-GFX942-NEXT: .LBB1_0: +; SDAG-GFX942: ; %bb.0: ; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX942-NEXT: s_load_dword s13, s[4:5], 0x34 ; SDAG-GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x44 @@ -1154,7 +1140,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) inreg %src, ptr ret void } -define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { +define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspace(7) %dst) { ; SDAG-LABEL: memcpy_known_small: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1191,14 +1177,7 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) inreg %src, ptr a ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] ; SDAG-GFX942-LABEL: memcpy_known_small: -; SDAG-GFX942: ; %bb.1: -; SDAG-GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 -; SDAG-GFX942-NEXT: s_load_dword s12, s[4:5], 0x10 -; SDAG-GFX942-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX942-NEXT: s_branch .LBB2_0 -; SDAG-GFX942-NEXT: .p2align 8 -; SDAG-GFX942-NEXT: ; %bb.2: -; SDAG-GFX942-NEXT: .LBB2_0: +; SDAG-GFX942: ; %bb.0: ; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX942-NEXT: s_load_dword s13, s[4:5], 0x34 ; SDAG-GFX942-NEXT: s_mov_b32 s12, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 7179f687c70f2..29736b62f2c00 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -228,6 +228,8 @@ ; GCN-O1-NEXT: Instrument function entry/exit with calls to e.g. 
mcount() (post inlining) ; GCN-O1-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O1-NEXT: Expand reduction intrinsics +; GCN-O1-NEXT: AMDGPU Preload Kernel Arguments +; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments ; GCN-O1-NEXT: Lower buffer fat pointer operations to buffer resources ; GCN-O1-NEXT: CallGraph Construction @@ -523,6 +525,8 @@ ; GCN-O1-OPTS-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O1-OPTS-NEXT: Expand reduction intrinsics ; GCN-O1-OPTS-NEXT: Early CSE +; GCN-O1-OPTS-NEXT: AMDGPU Preload Kernel Arguments +; GCN-O1-OPTS-NEXT: FunctionPass Manager ; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments ; GCN-O1-OPTS-NEXT: Lower buffer fat pointer operations to buffer resources ; GCN-O1-OPTS-NEXT: CallGraph Construction @@ -836,6 +840,8 @@ ; GCN-O2-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O2-NEXT: Expand reduction intrinsics ; GCN-O2-NEXT: Early CSE +; GCN-O2-NEXT: AMDGPU Preload Kernel Arguments +; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments ; GCN-O2-NEXT: Lower buffer fat pointer operations to buffer resources ; GCN-O2-NEXT: CallGraph Construction @@ -1164,6 +1170,8 @@ ; GCN-O3-NEXT: Lazy Block Frequency Analysis ; GCN-O3-NEXT: Optimization Remark Emitter ; GCN-O3-NEXT: Global Value Numbering +; GCN-O3-NEXT: AMDGPU Preload Kernel Arguments +; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments ; GCN-O3-NEXT: Lower buffer fat pointer operations to buffer resources ; GCN-O3-NEXT: CallGraph Construction diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll index 9606c68684957..4fa4b73456ecd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -407,7 +407,7 @@ main_body: ; TODO: NSA reassign is very limited and cannot work with VGPR tuples and subregs. 
-define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { +define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> %tdescr) { ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body ; GFX1013-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 @@ -571,7 +571,7 @@ main_body: ret void } -define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { +define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> %tdescr) { ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body ; GFX1013-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 @@ -719,7 +719,7 @@ main_body: ret void } -define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { +define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> %tdescr) { ; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body ; GFX1013-NEXT: s_clause 0x1 @@ -880,7 +880,7 @@ main_body: ret void } -define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { +define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> %tdescr) { ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body ; GFX1013-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index 8cf7497fca640..da2a3ce6bcbcc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -1485,7 +1485,7 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ret void } -define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { +define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 %oldval, ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i32: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x0 @@ -1570,7 +1570,7 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ret void } -define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 { +define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 %oldval, ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i64: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 @@ -1673,7 +1673,7 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr ret void } -define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ptr addrspace(1) %out, double %src0, i32 %src1) #1 { +define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double %oldval, ptr addrspace(1) %out, double %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_f64: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll index 85839bc472dcf..830d7cc840aeb 100644 --- 
a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -S < %s | FileCheck -check-prefix=NO-PRELOAD %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s | FileCheck -check-prefix=PRELOAD %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes='amdgpu-attributor,amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments)' -S < %s | FileCheck -check-prefix=NO-PRELOAD %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes='amdgpu-attributor,amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s | FileCheck -check-prefix=PRELOAD %s define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) { ; NO-PRELOAD-LABEL: define amdgpu_kernel void @preload_block_count_x( @@ -39,7 +39,7 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i5 ; NO-PRELOAD-NEXT: ret void ; ; PRELOAD-LABEL: define amdgpu_kernel void @no_free_sgprs_block_count_x( -; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i512 inreg [[TMP0:%.*]]) #[[ATTR0]] { +; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i512 [[TMP0:%.*]]) #[[ATTR0]] { ; PRELOAD-NEXT: [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(328) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll index 89c9801b5e466..1055abe6d3499 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll @@ -1,11 +1,11 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s 2>&1 \ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes='amdgpu-attributor,amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s 2>&1 \ ; RUN: | FileCheck --match-full-lines --implicit-check-not='declare' %s ; Confirms we do not leave behind a declaration which references the same ; DISubprogram metadata. 
; CHECK: define amdgpu_kernel void @preload_block_count_x{{.*}} !dbg ![[#]] !max_work_group_size ![[#]] { -; CHECK: declare void @0{{.*}} #[[#]] +; CHECK-NOT: declare void @0{{.*}} #[[#]] ; CHECK: declare noundef align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #[[#]] ; CHECK: declare noundef align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #[[#]] diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll index c26f0926d86b2..79b531e3ce785 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll @@ -873,21 +873,17 @@ define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(pt ; ; GFX90a-LABEL: preload_block_count_z_workgroup_size_z_remainder_z: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 -; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB22_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB22_0: -; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x1c +; GFX90a-NEXT: s_lshr_b32 s0, s15, 16 ; GFX90a-NEXT: s_and_b32 s1, s14, 0xffff ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 ; GFX90a-NEXT: v_mov_b32_e32 v0, s12 ; GFX90a-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_lshr_b32 s0, s0, 16 ; GFX90a-NEXT: v_mov_b32_e32 v2, s0 ; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] ; GFX90a-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll index 91bfedd46e6fa..1a445af94b9ad 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-attributor -amdgpu-lower-kernel-arguments -S < %s | FileCheck -check-prefix=NO-PRELOAD %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=1 -S < %s | FileCheck -check-prefix=PRELOAD-1 %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=3 -S < %s | FileCheck -check-prefix=PRELOAD-3 %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=8 -S < %s | FileCheck -check-prefix=PRELOAD-8 %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes='amdgpu-attributor,amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments)' -S < %s | FileCheck -check-prefix=NO-PRELOAD %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes='amdgpu-attributor,amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=1 -S < %s | FileCheck -check-prefix=PRELOAD-1 %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes='amdgpu-attributor,amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=3 -S < %s | FileCheck -check-prefix=PRELOAD-3 %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes='amdgpu-attributor,amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments)' 
-amdgpu-kernarg-preload-count=8 -S < %s | FileCheck -check-prefix=PRELOAD-8 %s define amdgpu_kernel void @test_preload_IR_lowering_kernel_2(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2 @@ -185,7 +185,7 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_8(ptr addrspace(1) %i ; PRELOAD-3-NEXT: ret void ; ; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8 -; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) inreg [[OUT3:%.*]]) #[[ATTR0]] { +; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] { ; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-8-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56 ; PRELOAD-8-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]] @@ -220,14 +220,10 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset(ptr ad ; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] ; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8 ; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16 -; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24 -; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] ; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 ; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 ; NO-PRELOAD-NEXT: ret void ; ; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset @@ -235,14 +231,10 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset(ptr ad ; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 
dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8 ; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16 -; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24 -; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] ; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 ; PRELOAD-1-NEXT: ret void ; ; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset @@ -270,22 +262,16 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset(ptr ad ret void } -; Only preload the first sequence of arguments with the inreg attribute. In the NO-PRELOAD case this is just the first argument. 
- define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence(ptr addrspace(1) inreg %in, ptr addrspace(1) %in1, ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %out1) #0 { ; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence ; NO-PRELOAD-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { ; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8 ; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16 -; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24 -; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] ; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 ; NO-PRELOAD-NEXT: ret void ; ; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence @@ -293,14 +279,10 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset_two_se ; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8 ; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16 -; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24 -; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] ; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 
 ; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
-; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
-; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
+; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
 ; PRELOAD-1-NEXT: ret void
 ;
 ; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll
deleted file mode 100644
index 20edbd6c0d0fa..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll
+++ /dev/null
@@ -1,263 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-1 %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=3 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-3 %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=16 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-16 %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=20 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-20 %s
-
-define amdgpu_kernel void @test_preload_hint_kernel_1(ptr %0) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1
-; NO-PRELOAD-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; NO-PRELOAD-NEXT: ret void
-;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1
-; PRELOAD-1-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1
-; PRELOAD-3-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1
-; PRELOAD-16-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; PRELOAD-16-NEXT: ret void
-;
-; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1
-; PRELOAD-20-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; PRELOAD-20-NEXT: ret void
-;
-  ret void
-}
-
-define amdgpu_kernel void @test_preload_hint_kernel_2(i32 %0, i64 %1) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2
-; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: ret void
-;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2
-; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] {
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2
-; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] {
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2
-; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] {
-; PRELOAD-16-NEXT: ret void
-;
-; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2
-; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] {
-; PRELOAD-20-NEXT: ret void
-;
-  ret void
-}
-
-define amdgpu_kernel void @test_preload_hint_kernel_4(i32 %0, i64 %1, <2 x float> %2, ptr %3) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4
-; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: ret void
-;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4
-; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] {
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4
-; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] {
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4
-; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]]) #[[ATTR0]] {
-; PRELOAD-16-NEXT: ret void
-;
-; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4
-; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]]) #[[ATTR0]] {
-; PRELOAD-20-NEXT: ret void
-;
-  ret void
-}
-
-define amdgpu_kernel void @test_preload_hint_kernel_18(i32 %0, i64 %1, <2 x float> %2, ptr %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, i32 %14, i32 %15, i32 %16, i32 %17) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18
-; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]], i32 [[TMP4:%.*]], i32 [[TMP5:%.*]], i32 [[TMP6:%.*]], i32 [[TMP7:%.*]], i32 [[TMP8:%.*]], i32 [[TMP9:%.*]], i32 [[TMP10:%.*]], i32 [[TMP11:%.*]], i32 [[TMP12:%.*]], i32 [[TMP13:%.*]], i32 [[TMP14:%.*]], i32 [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: ret void
-;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18
-; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]], i32 [[TMP4:%.*]], i32 [[TMP5:%.*]], i32 [[TMP6:%.*]], i32 [[TMP7:%.*]], i32 [[TMP8:%.*]], i32 [[TMP9:%.*]], i32 [[TMP10:%.*]], i32 [[TMP11:%.*]], i32 [[TMP12:%.*]], i32 [[TMP13:%.*]], i32 [[TMP14:%.*]], i32 [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] {
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18
-; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr [[TMP3:%.*]], i32 [[TMP4:%.*]], i32 [[TMP5:%.*]], i32 [[TMP6:%.*]], i32 [[TMP7:%.*]], i32 [[TMP8:%.*]], i32 [[TMP9:%.*]], i32 [[TMP10:%.*]], i32 [[TMP11:%.*]], i32 [[TMP12:%.*]], i32 [[TMP13:%.*]], i32 [[TMP14:%.*]], i32 [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] {
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18
-; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], i32 inreg [[TMP6:%.*]], i32 inreg [[TMP7:%.*]], i32 inreg [[TMP8:%.*]], i32 inreg [[TMP9:%.*]], i32 inreg [[TMP10:%.*]], i32 inreg [[TMP11:%.*]], i32 inreg [[TMP12:%.*]], i32 inreg [[TMP13:%.*]], i32 inreg [[TMP14:%.*]], i32 inreg [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] {
-; PRELOAD-16-NEXT: ret void
-;
-; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18
-; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], i32 inreg [[TMP6:%.*]], i32 inreg [[TMP7:%.*]], i32 inreg [[TMP8:%.*]], i32 inreg [[TMP9:%.*]], i32 inreg [[TMP10:%.*]], i32 inreg [[TMP11:%.*]], i32 inreg [[TMP12:%.*]], i32 inreg [[TMP13:%.*]], i32 inreg [[TMP14:%.*]], i32 inreg [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] {
-; PRELOAD-20-NEXT: ret void
-;
-  ret void
-}
-
-define void @test_preload_hint_non_kernel_2(i32 %0, i64 %1) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2
-; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] {
-; NO-PRELOAD-NEXT: ret void
-;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2
-; PRELOAD-1-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] {
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2
-; PRELOAD-3-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] {
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2
-; PRELOAD-16-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] {
-; PRELOAD-16-NEXT: ret void
-;
-; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2
-; PRELOAD-20-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] {
-; PRELOAD-20-NEXT: ret void
-;
-  ret void
-}
-
-define amdgpu_kernel void @test_preload_hint_kernel_1_call_func(ptr %0) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func
-; NO-PRELOAD-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] {
-; NO-PRELOAD-NEXT: call void @func(ptr [[TMP0]])
-; NO-PRELOAD-NEXT: ret void
-;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func
-; PRELOAD-1-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] {
-; PRELOAD-1-NEXT: call void @func(ptr [[TMP0]])
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func
-; PRELOAD-3-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] {
-; PRELOAD-3-NEXT: call void @func(ptr [[TMP0]])
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func
-; PRELOAD-16-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] {
-; PRELOAD-16-NEXT: call void @func(ptr [[TMP0]])
-; PRELOAD-16-NEXT: ret void
-;
-; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func
-; PRELOAD-20-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] {
-; PRELOAD-20-NEXT: call void @func(ptr [[TMP0]])
-; PRELOAD-20-NEXT: ret void
-;
-  call void @func(ptr %0)
-  ret void
-}
-
-define amdgpu_kernel void @test_preload_hint_kernel_1_call_intrinsic(i16 %0) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic
-; NO-PRELOAD-SAME: (i16 [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
-; NO-PRELOAD-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]])
-; NO-PRELOAD-NEXT: ret void
-;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic
-; PRELOAD-1-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
-; PRELOAD-1-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]])
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic
-; PRELOAD-3-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
-; PRELOAD-3-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]])
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic
-; PRELOAD-16-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
-; PRELOAD-16-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]])
-; PRELOAD-16-NEXT: ret void
-;
-; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic
-; PRELOAD-20-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
-; PRELOAD-20-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]])
-; PRELOAD-20-NEXT: ret void
-;
-  call void @llvm.amdgcn.set.prio(i16 %0)
-  ret void
-}
-
-define spir_kernel void @test_preload_hint_kernel_1_spir_cc(ptr %0) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc
-; NO-PRELOAD-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: ret void
-;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc
-; PRELOAD-1-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] {
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc
-; PRELOAD-3-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] {
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc
-; PRELOAD-16-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] {
-; PRELOAD-16-NEXT: ret void
-;
-; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc
-; PRELOAD-20-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] {
-; PRELOAD-20-NEXT: ret void
-;
-  ret void
-}
-
-define amdgpu_kernel void @test_preload_hint_kernel_2_preexisting(i32 inreg %0, i64 %1) #0 {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting
-; NO-PRELOAD-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: ret void
-;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting
-; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] {
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting
-; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] {
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting
-; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] {
-; PRELOAD-16-NEXT: ret void
-;
-; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting
-; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] {
-; PRELOAD-20-NEXT: ret void
-;
-  ret void
-}
-
-define amdgpu_kernel void @test_preload_hint_kernel_incompatible_attributes(ptr addrspace(4) byref(i32) %0, ptr nest %1) {
-; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes
-; NO-PRELOAD-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
-; NO-PRELOAD-NEXT: ret void
-;
-; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes
-; PRELOAD-1-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
-; PRELOAD-1-NEXT: ret void
-;
-; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes
-; PRELOAD-3-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
-; PRELOAD-3-NEXT: ret void
-;
-; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes
-; PRELOAD-16-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
-; PRELOAD-16-NEXT: ret void
-;
-; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes
-; PRELOAD-20-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
-; PRELOAD-20-NEXT: ret void
-;
-  ret void
-}
-
-declare void @func(ptr) #0
-declare void @llvm.amdgcn.set.prio(i16)
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index 7ae0c11dca279..41fe0d48ec819 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -927,18 +927,17 @@ define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inr
 ; GFX90a-NEXT: .p2align 8
 ; GFX90a-NEXT: ; %bb.2:
 ; GFX90a-NEXT: .LBB23_0:
-; GFX90a-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x20
+; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
 ; GFX90a-NEXT: v_mov_b32_e32 v3, 0
 ; GFX90a-NEXT: v_mov_b32_e32 v0, s10
 ; GFX90a-NEXT: global_store_short v3, v0, s[8:9]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s15
 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_mov_b32_e32 v0, s3
-; GFX90a-NEXT: global_store_short v3, v0, s[6:7] offset:12
-; GFX90a-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: global_store_short v3, v0, s[0:1] offset:12
+; GFX90a-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
 ; GFX90a-NEXT: s_endpgm
   store half %in, ptr addrspace(1) %out
   store <7 x bfloat> %in2, ptr addrspace(1) %out2
@@ -1172,16 +1171,15 @@ define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg %
 ; GFX90a-NEXT: .p2align 8
 ; GFX90a-NEXT: ; %bb.2:
 ; GFX90a-NEXT: .LBB29_0:
-; GFX90a-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
 ; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20
 ; GFX90a-NEXT: v_mov_b32_e32 v4, s10
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-NEXT: v_mov_b32_e32 v2, s14
 ; GFX90a-NEXT: global_store_short v3, v4, s[8:9]
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5]
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
 ; GFX90a-NEXT: s_endpgm
   store i16 %in, ptr addrspace(1) %out
   store <3 x i32> %in2, ptr addrspace(1) %out2
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 68010fc45b5a5..09d19be98a7ac 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -301,7 +301,7 @@ define hidden i32 @called(i32 %a) noinline {
   ret i32 %sub
 }
 
-define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
+define amdgpu_kernel void @call(ptr addrspace(8) %tmp14, i32 %arg) {
 ; GFX9-O0-LABEL: call:
 ; GFX9-O0: ; %bb.0:
 ; GFX9-O0-NEXT: s_mov_b32 s32, 0
@@ -533,7 +533,7 @@ define i64 @called_i64(i64 %a) noinline {
   ret i64 %sub
 }
 
-define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) {
+define amdgpu_kernel void @call_i64(ptr addrspace(8) %tmp14, i64 %arg) {
 ; GFX9-O0-LABEL: call_i64:
 ; GFX9-O0: ; %bb.0:
 ; GFX9-O0-NEXT: s_mov_b32 s32, 0
@@ -1153,7 +1153,7 @@ define hidden i32 @strict_wwm_called(i32 %a) noinline {
   ret i32 %sub
 }
 
-define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
+define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) %tmp14, i32 %arg) {
 ; GFX9-O0-LABEL: strict_wwm_call:
 ; GFX9-O0: ; %bb.0:
 ; GFX9-O0-NEXT: s_mov_b32 s32, 0
@@ -1385,7 +1385,7 @@ define i64 @strict_wwm_called_i64(i64 %a) noinline {
   ret i64 %sub
 }
 
-define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) {
+define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) %tmp14, i64 %arg) {
 ; GFX9-O0-LABEL: strict_wwm_call_i64:
 ; GFX9-O0: ; %bb.0:
 ; GFX9-O0-NEXT: s_mov_b32 s32, 0