diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 546db318c17d5..07e03bdc919ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -14,7 +14,9 @@
 #include "GCNSubtarget.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/Analysis/CycleAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/Target/TargetMachine.h"
@@ -144,6 +146,213 @@ static bool funcRequiresHostcallPtr(const Function &F) {
 }

 namespace {
+
+// Helper that decides which kernel arguments of a single kernel can be
+// preloaded into user SGPRs, and rewrites the kernel signature when hidden
+// (implicit) arguments are preloaded as well.
+class PreloadKernelArgInfo {
+private:
+  // The kernel being processed.
+  Function &F;
+  const GCNSubtarget &ST;
+  // Set by setInitialFreeUserSGPRsCount().
+  unsigned NumFreeUserSGPRs;
+
+  // Hidden arguments that may be preloaded. The enumerator value is the index
+  // of the matching entry in HiddenArgs.
+  enum HiddenArg : unsigned {
+    HIDDEN_BLOCK_COUNT_X,
+    HIDDEN_BLOCK_COUNT_Y,
+    HIDDEN_BLOCK_COUNT_Z,
+    HIDDEN_GROUP_SIZE_X,
+    HIDDEN_GROUP_SIZE_Y,
+    HIDDEN_GROUP_SIZE_Z,
+    HIDDEN_REMAINDER_X,
+    HIDDEN_REMAINDER_Y,
+    HIDDEN_REMAINDER_Z,
+    END_HIDDEN_ARGS
+  };
+
+  // Stores information about a specific hidden argument.
+  struct HiddenArgInfo {
+    // Offset in bytes from the location in the kernarg segment pointed to by
+    // the implicitarg pointer.
+    uint8_t Offset;
+    // The size of the hidden argument in bytes.
+    uint8_t Size;
+    // The name of the hidden argument in the kernel signature.
+    const char *Name;
+  };
+
+  // Indexed by HiddenArg. Entries are listed in ascending offset order.
+  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
+      {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
+      {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
+      {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
+      {18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"},
+      {22, 2, "_hidden_remainder_z"}};
+
+  // Returns the hidden argument that starts exactly at Offset, or
+  // END_HIDDEN_ARGS if no table entry has that offset.
+  static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
+    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
+      if (HiddenArgs[I].Offset == Offset)
+        return static_cast<HiddenArg>(I);
+
+    return END_HIDDEN_ARGS;
+  }
+
+  // Returns the integer type whose bit width is eight times the byte size
+  // recorded for HA. HA must be a real hidden argument.
+  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
+    if (HA < END_HIDDEN_ARGS)
+      return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);
+
+    llvm_unreachable("Unexpected hidden argument.");
+  }
+
+  // Returns the signature name recorded for HA. HA must be a real hidden
+  // argument.
+  static const char *getHiddenArgName(HiddenArg HA) {
+    if (HA < END_HIDDEN_ARGS) {
+      return HiddenArgs[HA].Name;
+    }
+    llvm_unreachable("Unexpected hidden argument.");
+  }
+
+  // Clones the function after adding implicit arguments to the argument list
+  // and returns the new updated function. Preloaded implicit arguments are
+  // added up to and including the last one that will be preloaded, indicated by
+  // LastPreloadIndex. Currently preloading is only performed on the totality of
+  // sequential data from the kernarg segment including implicit (hidden)
+  // arguments. This means that all arguments up to the last preloaded argument
+  // will also be preloaded even if that data is unused.
+  Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
+    FunctionType *FT = F.getFunctionType();
+    LLVMContext &Ctx = F.getParent()->getContext();
+
+    // New parameter list: the original parameters followed by hidden
+    // arguments 0..LastPreloadIndex.
+    SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
+    for (unsigned I = 0; I <= LastPreloadIndex; ++I)
+      FTypes.push_back(getHiddenArgType(Ctx, static_cast<HiddenArg>(I)));
+
+    FunctionType *NFT =
+        FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
+    Function *NF =
+        Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());
+
+    NF->copyAttributesFrom(&F);
+    NF->copyMetadata(&F, 0);
+    NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);
+
+    // Insert the clone next to the original, then move the name and the
+    // whole body over to it.
+    F.getParent()->getFunctionList().insert(F.getIterator(), NF);
+    NF->takeName(&F);
+    NF->splice(NF->begin(), &F);
+
+    // Rewire every original argument to its counterpart in the clone. After
+    // this loop NFArg points at the first appended hidden argument.
+    Function::arg_iterator NFArg = NF->arg_begin();
+    for (Argument &Arg : F.args()) {
+      Arg.replaceAllUsesWith(&*NFArg);
+      NFArg->takeName(&Arg);
+      ++NFArg;
+    }
+
+    // Tag and name each appended hidden argument.
+    AttrBuilder AB(Ctx);
+    AB.addAttribute(Attribute::InReg);
+    AB.addAttribute("amdgpu-hidden-argument");
+    AttributeList AL = NF->getAttributes();
+    for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
+      AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
+      NFArg->setName(getHiddenArgName(static_cast<HiddenArg>(I)));
+      ++NFArg;
+    }
+
+    NF->setAttributes(AL);
+    F.replaceAllUsesWith(NF);
+
+    // F is left in the module without a body; this function does not erase it.
+    return NF;
+  }
+
+public:
+  PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
+    setInitialFreeUserSGPRsCount();
+  }
+
+  // Initializes NumFreeUserSGPRs with the number of free user SGPRs that
+  // GCNUserSGPRUsageInfo reports for this function and subtarget.
+  void setInitialFreeUserSGPRsCount() {
+    GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
+    NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
+  }
+
+  // Returns true if kernarg data ending at byte offset ExplicitArgOffset fits
+  // in the free user SGPRs. The budget is computed as 4 bytes per SGPR.
+  bool canPreloadKernArgAtOffset(uint64_t ExplicitArgOffset) const {
+    return ExplicitArgOffset <= NumFreeUserSGPRs * 4;
+  }
+
+  // Try to allocate SGPRs to preload hidden kernel arguments.
+  //
+  // Collects the simple loads in F that read a known hidden argument through
+  // llvm.amdgcn.implicitarg.ptr, keeps the leading ones (by offset) that
+  // still fit in the free user SGPRs, clones F with those hidden arguments
+  // appended to its signature and replaces the loads with the new arguments.
+  // When a clone is made, F is appended to FunctionsToErase; the caller is
+  // expected to erase it.
+  void
+  tryAllocHiddenArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
+                                SmallVectorImpl<Function *> &FunctionsToErase) {
+    Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
+        F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
+    if (!ImplicitArgPtr)
+      return;
+
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    // Pair is the load and the load offset.
+    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
+    for (auto *U : ImplicitArgPtr->users()) {
+      Instruction *CI = dyn_cast<Instruction>(U);
+      if (!CI || CI->getParent()->getParent() != &F)
+        continue;
+
+      for (auto *PtrUser : CI->users()) {
+        int64_t Offset = 0;
+        auto *Load = dyn_cast<LoadInst>(PtrUser); // Load from ImplicitArgPtr?
+        if (!Load) {
+          if (GetPointerBaseWithConstantOffset(PtrUser, Offset, DL) != CI)
+            continue;
+
+          // A derived pointer without users (e.g. a dead GEP) has no load to
+          // inspect, and dereferencing user_begin() on it would be invalid.
+          if (PtrUser->user_empty())
+            continue;
+
+          // Only the first user of the derived pointer is considered.
+          Load = dyn_cast<LoadInst>(*PtrUser->user_begin()); // Load from GEP?
+        }
+
+        if (!Load || !Load->isSimple())
+          continue;
+
+        // FIXME: Expand to handle merged loads.
+        LLVMContext &Ctx = F.getParent()->getContext();
+        Type *LoadTy = Load->getType();
+        HiddenArg HA = getHiddenArgFromOffset(Offset);
+        if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
+          continue;
+
+        // Offset matched a table entry above, so it fits in unsigned.
+        ImplicitArgLoads.emplace_back(Load, static_cast<unsigned>(Offset));
+      }
+    }
+
+    if (ImplicitArgLoads.empty())
+      return;
+
+    // Allocate loads in order of offset. We need to be sure that the implicit
+    // argument can actually be preloaded.
+    llvm::sort(ImplicitArgLoads, less_second());
+
+    // If we fail to preload any implicit argument we know we don't have SGPRs
+    // to preload any subsequent ones with larger offsets. Find the first
+    // argument that we cannot preload.
+    auto *PreloadEnd =
+        std::find_if(ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
+                     [&](const std::pair<LoadInst *, unsigned> &Load) {
+                       unsigned LoadSize =
+                           DL.getTypeStoreSize(Load.first->getType());
+                       unsigned LoadOffset = Load.second;
+                       return !canPreloadKernArgAtOffset(
+                           LoadOffset + LoadSize + ImplicitArgsBaseOffset);
+                     });
+
+    if (PreloadEnd == ImplicitArgLoads.begin())
+      return;
+
+    unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
+    Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
+    assert(NF);
+    FunctionsToErase.push_back(&F);
+    for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
+      LoadInst *Load = I->first;
+      unsigned LoadOffset = I->second;
+      unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
+      // The hidden arguments are the last LastHiddenArgIndex + 1 parameters
+      // of NF, in table order.
+      unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
+      Argument *Arg = NF->getArg(Index);
+      Load->replaceAllUsesWith(Arg);
+    }
+  }
+};
+
 class AMDGPUInformationCache : public InformationCache {
 public:
   AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
@@ -1314,19 +1523,66 @@ struct AAAMDGPUNoAGPR

 const char AAAMDGPUNoAGPR::ID = 0;

-static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
-  const GCNSubtarget &ST = TM.getSubtarget(F);
-  for (unsigned I = 0;
-       I < F.arg_size() &&
-       I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs());
-       ++I) {
-    Argument &Arg = *F.getArg(I);
-    // Check for incompatible attributes.
-    if (Arg.hasByRefAttr() || Arg.hasNestAttr())
-      break;
+// For every AMDGPU_KERNEL function in Functions whose subtarget supports
+// kernarg preloading, marks the leading run of preloadable explicit arguments
+// 'inreg'. If every explicit argument was marked, also tries to preload
+// hidden arguments, which replaces the kernel with a clone that has an
+// extended signature. Returns true if any function was modified.
+static bool markKernelArgsAsInreg(SetVector<Function *> &Functions,
+                                  TargetMachine &TM) {
+  bool Changed = false;
+  SmallVector<Function *, 4> FunctionsToErase;
+  for (auto *F : Functions) {
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
+    if (!ST.hasKernargPreload() ||
+        F->getCallingConv() != CallingConv::AMDGPU_KERNEL)
+      continue;
+
+    PreloadKernelArgInfo PreloadInfo(*F, ST);
+    uint64_t ExplicitArgOffset = 0;
+    const DataLayout &DL = F->getDataLayout();
+    const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();
+    unsigned NumPreloadsRequested = KernargPreloadCount;
+    unsigned NumPreloadedExplicitArgs = 0;
+    for (Argument &Arg : F->args()) {
+      // Avoid incompatible attributes and guard against running this pass
+      // twice.
+      //
+      // TODO: Preload byref kernel arguments
+      if (Arg.hasByRefAttr() || Arg.hasNestAttr() ||
+          Arg.hasAttribute("amdgpu-hidden-argument"))
+        break;
+
+      // Inreg may be pre-existing on some arguments, try to preload these.
+      if (NumPreloadsRequested == 0 && !Arg.hasInRegAttr())
+        break;
+
+      // FIXME: Preload aggregates.
+      if (Arg.getType()->isAggregateType())
+        break;
+
+      Type *ArgTy = Arg.getType();
+      Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
+      uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
+      ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
+      if (!PreloadInfo.canPreloadKernArgAtOffset(ExplicitArgOffset))
+        break;
+
+      // Only report a change when the attribute is actually new.
+      if (!Arg.hasInRegAttr()) {
+        Arg.addAttr(Attribute::InReg);
+        Changed = true;
+      }
+      NumPreloadedExplicitArgs++;
+      if (NumPreloadsRequested > 0)
+        NumPreloadsRequested--;
+    }

-    Arg.addAttr(Attribute::InReg);
+    // Only try preloading hidden arguments if we can successfully preload the
+    // last explicit argument.
+    if (NumPreloadedExplicitArgs == F->arg_size()) {
+      uint64_t ImplicitArgsBaseOffset =
+          alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
+          BaseOffset;
+      PreloadInfo.tryAllocHiddenArgPreloadSGPRs(ImplicitArgsBaseOffset,
+                                                FunctionsToErase);
+    }
   }
+
+  // Erase cloned functions if we needed to update the kernel signature to
+  // support preloading hidden kernel arguments. Every entry stands for a
+  // kernel that was replaced, which is a module change as well.
+  Changed |= !FunctionsToErase.empty();
+  for (auto *F : FunctionsToErase)
+    F->eraseFromParent();
+
+  return Changed;
 }

 static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
@@ -1378,8 +1634,6 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
     if (!AMDGPU::isEntryFunctionCC(CC)) {
       A.getOrCreateAAFor(IRPosition::function(*F));
       A.getOrCreateAAFor(IRPosition::function(*F));
-    } else if (CC == CallingConv::AMDGPU_KERNEL) {
-      addPreloadKernArgHint(*F, TM);
     }

     for (auto &I : instructions(F)) {
@@ -1400,6 +1654,11 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
   }

   ChangeStatus Change = A.run();
+
+  // Mark kernel arguments with 'inreg' attribute to indicate that they should
+  // be preloaded into SGPRs. This edits the IR outside of the Attributor, so
+  // its result has to be folded into the returned change status.
+  const bool PreloadChanged = markKernelArgsAsInreg(Functions, TM);
+
-  return Change == ChangeStatus::CHANGED;
+  return PreloadChanged || Change == ChangeStatus::CHANGED;
 }

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index e9d009baa20af..7f6c5b4b476d8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -27,230 +27,6 @@ using namespace llvm;

 namespace {

-class PreloadKernelArgInfo {
-private:
-  Function &F;
-  const GCNSubtarget &ST;
-  unsigned NumFreeUserSGPRs;
-
-  enum HiddenArg : unsigned {
-    HIDDEN_BLOCK_COUNT_X,
-    HIDDEN_BLOCK_COUNT_Y,
-    HIDDEN_BLOCK_COUNT_Z,
-    HIDDEN_GROUP_SIZE_X,
-    HIDDEN_GROUP_SIZE_Y,
-    HIDDEN_GROUP_SIZE_Z,
-    HIDDEN_REMAINDER_X,
-    HIDDEN_REMAINDER_Y,
-    HIDDEN_REMAINDER_Z,
-    END_HIDDEN_ARGS
-  };
-
-  // Stores information about a specific hidden argument.
- struct HiddenArgInfo { - // Offset in bytes from the location in the kernearg segment pointed to by - // the implicitarg pointer. - uint8_t Offset; - // The size of the hidden argument in bytes. - uint8_t Size; - // The name of the hidden argument in the kernel signature. - const char *Name; - }; - - static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = { - {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"}, - {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"}, - {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"}, - {18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"}, - {22, 2, "_hidden_remainder_z"}}; - - static HiddenArg getHiddenArgFromOffset(unsigned Offset) { - for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I) - if (HiddenArgs[I].Offset == Offset) - return static_cast(I); - - return END_HIDDEN_ARGS; - } - - static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) { - if (HA < END_HIDDEN_ARGS) - return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8); - - llvm_unreachable("Unexpected hidden argument."); - } - - static const char *getHiddenArgName(HiddenArg HA) { - if (HA < END_HIDDEN_ARGS) { - return HiddenArgs[HA].Name; - } - llvm_unreachable("Unexpected hidden argument."); - } - - // Clones the function after adding implicit arguments to the argument list - // and returns the new updated function. Preloaded implicit arguments are - // added up to and including the last one that will be preloaded, indicated by - // LastPreloadIndex. Currently preloading is only performed on the totality of - // sequential data from the kernarg segment including implicit (hidden) - // arguments. This means that all arguments up to the last preloaded argument - // will also be preloaded even if that data is unused. 
- Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) { - FunctionType *FT = F.getFunctionType(); - LLVMContext &Ctx = F.getParent()->getContext(); - SmallVector FTypes(FT->param_begin(), FT->param_end()); - for (unsigned I = 0; I <= LastPreloadIndex; ++I) - FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I))); - - FunctionType *NFT = - FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg()); - Function *NF = - Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName()); - - NF->copyAttributesFrom(&F); - NF->copyMetadata(&F, 0); - NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat); - - F.getParent()->getFunctionList().insert(F.getIterator(), NF); - NF->takeName(&F); - NF->splice(NF->begin(), &F); - - Function::arg_iterator NFArg = NF->arg_begin(); - for (Argument &Arg : F.args()) { - Arg.replaceAllUsesWith(&*NFArg); - NFArg->takeName(&Arg); - ++NFArg; - } - - AttrBuilder AB(Ctx); - AB.addAttribute(Attribute::InReg); - AB.addAttribute("amdgpu-hidden-argument"); - AttributeList AL = NF->getAttributes(); - for (unsigned I = 0; I <= LastPreloadIndex; ++I) { - AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB); - NFArg++->setName(getHiddenArgName(HiddenArg(I))); - } - - NF->setAttributes(AL); - F.replaceAllUsesWith(NF); - F.setCallingConv(CallingConv::C); - - return NF; - } - -public: - PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) { - setInitialFreeUserSGPRsCount(); - } - - // Returns the maximum number of user SGPRs that we have available to preload - // arguments. - void setInitialFreeUserSGPRsCount() { - GCNUserSGPRUsageInfo UserSGPRInfo(F, ST); - NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs(); - } - - bool tryAllocPreloadSGPRs(unsigned AllocSize, uint64_t ArgOffset, - uint64_t LastExplicitArgOffset) { - // Check if this argument may be loaded into the same register as the - // previous argument. 
- if (ArgOffset - LastExplicitArgOffset < 4 && - !isAligned(Align(4), ArgOffset)) - return true; - - // Pad SGPRs for kernarg alignment. - ArgOffset = alignDown(ArgOffset, 4); - unsigned Padding = ArgOffset - LastExplicitArgOffset; - unsigned PaddingSGPRs = alignTo(Padding, 4) / 4; - unsigned NumPreloadSGPRs = alignTo(AllocSize, 4) / 4; - if (NumPreloadSGPRs + PaddingSGPRs > NumFreeUserSGPRs) - return false; - - NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs); - return true; - } - - // Try to allocate SGPRs to preload implicit kernel arguments. - void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset, - uint64_t LastExplicitArgOffset, - IRBuilder<> &Builder) { - Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists( - F.getParent(), Intrinsic::amdgcn_implicitarg_ptr); - if (!ImplicitArgPtr) - return; - - const DataLayout &DL = F.getParent()->getDataLayout(); - // Pair is the load and the load offset. - SmallVector, 4> ImplicitArgLoads; - for (auto *U : ImplicitArgPtr->users()) { - Instruction *CI = dyn_cast(U); - if (!CI || CI->getParent()->getParent() != &F) - continue; - - for (auto *U : CI->users()) { - int64_t Offset = 0; - auto *Load = dyn_cast(U); // Load from ImplicitArgPtr? - if (!Load) { - if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI) - continue; - - Load = dyn_cast(*U->user_begin()); // Load from GEP? - } - - if (!Load || !Load->isSimple()) - continue; - - // FIXME: Expand to handle 64-bit implicit args and large merged loads. - LLVMContext &Ctx = F.getParent()->getContext(); - Type *LoadTy = Load->getType(); - HiddenArg HA = getHiddenArgFromOffset(Offset); - if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA)) - continue; - - ImplicitArgLoads.push_back(std::make_pair(Load, Offset)); - } - } - - if (ImplicitArgLoads.empty()) - return; - - // Allocate loads in order of offset. We need to be sure that the implicit - // argument can actually be preloaded. 
- std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second()); - - // If we fail to preload any implicit argument we know we don't have SGPRs - // to preload any subsequent ones with larger offsets. Find the first - // argument that we cannot preload. - auto *PreloadEnd = std::find_if( - ImplicitArgLoads.begin(), ImplicitArgLoads.end(), - [&](const std::pair &Load) { - unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType()); - unsigned LoadOffset = Load.second; - if (!tryAllocPreloadSGPRs(LoadSize, - LoadOffset + ImplicitArgsBaseOffset, - LastExplicitArgOffset)) - return true; - - LastExplicitArgOffset = - ImplicitArgsBaseOffset + LoadOffset + LoadSize; - return false; - }); - - if (PreloadEnd == ImplicitArgLoads.begin()) - return; - - unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second); - Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex); - assert(NF); - for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) { - LoadInst *LoadInst = I->first; - unsigned LoadOffset = I->second; - unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset); - unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1; - Argument *Arg = NF->getArg(Index); - LoadInst->replaceAllUsesWith(Arg); - } - } -}; - class AMDGPULowerKernelArguments : public FunctionPass { public: static char ID; @@ -310,10 +86,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize)); uint64_t ExplicitArgOffset = 0; - // Preloaded kernel arguments must be sequential. - bool InPreloadSequence = true; - PreloadKernelArgInfo PreloadInfo(F, ST); - for (Argument &Arg : F.args()) { const bool IsByRef = Arg.hasByRefAttr(); Type *ArgTy = IsByRef ? 
Arg.getParamByRefType() : Arg.getType(); @@ -324,25 +96,10 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset; - uint64_t LastExplicitArgOffset = ExplicitArgOffset; ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize; - // Guard against the situation where hidden arguments have already been - // lowered and added to the kernel function signiture, i.e. in a situation - // where this pass has run twice. - if (Arg.hasAttribute("amdgpu-hidden-argument")) - break; - - // Try to preload this argument into user SGPRs. - if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() && - !Arg.getType()->isAggregateType()) - if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset, - LastExplicitArgOffset)) - continue; - - InPreloadSequence = false; - - if (Arg.use_empty()) + // Inreg arguments should be preloaded. + if (Arg.use_empty() || Arg.hasInRegAttr()) continue; // If this is byval, the loads are already explicit in the function. 
We just @@ -482,14 +239,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { KernArgSegment->addRetAttr( Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign))); - if (InPreloadSequence) { - uint64_t ImplicitArgsBaseOffset = - alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) + - BaseOffset; - PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset, - ExplicitArgOffset, Builder); - } - return true; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index b26ddbdd7a342..5cba777959d8b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -625,7 +625,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ret <4 x float> %r } -define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { +define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 @@ -739,7 +739,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ret void } -define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { +define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 @@ -843,7 +843,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ret void } -define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { +define 
amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> %tdescr) { ; GFX10-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 @@ -925,7 +925,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ret void } -define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { +define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> %tdescr) { ; GFX10-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd-with-ret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd-with-ret.ll index 798a3ee1d75fd..076c12e0d5bc7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd-with-ret.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd-with-ret.ll @@ -8,7 +8,7 @@ declare <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half>, ptr ; GFX90A-LABEL: {{^}}buffer_atomic_add_f32_rtn: ; GFX90A: buffer_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}], s{{[0-9]+}} offen glc -define amdgpu_kernel void @buffer_atomic_add_f32_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 %soffset) { +define amdgpu_kernel void @buffer_atomic_add_f32_rtn(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset) { main_body: %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) store float %ret, ptr undef @@ -17,7 +17,7 @@ main_body: ; GFX90A-LABEL: {{^}}buffer_atomic_add_v2f16_rtn: ; GFX90A: buffer_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}], s{{[0-9]+}} offen glc -define amdgpu_kernel void @buffer_atomic_add_v2f16_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 
inreg %soffset) { +define amdgpu_kernel void @buffer_atomic_add_v2f16_rtn(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset) { main_body: %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) store <2 x half> %ret, ptr undef diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll index 8af5db9f62908..513ffb38fe7f9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -261,7 +261,7 @@ main_body: ; TODO: NSA reassign is very limited and cannot work with VGPR tuples and subregs. -define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { +define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> %tdescr) { ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body ; GFX1013-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 @@ -425,7 +425,7 @@ main_body: ret void } -define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { +define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> %tdescr) { ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body ; GFX1013-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 @@ -573,7 +573,7 @@ main_body: ret void } -define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { +define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> %tdescr) { ; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body ; GFX1013-NEXT: s_clause 0x1 @@ -734,7 +734,7 @@ main_body: ret void } -define amdgpu_kernel void 
@image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { +define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> %tdescr) { ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body ; GFX1013-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index 7342c366799e9..68bc20456be6a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -1486,7 +1486,7 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ret void } -define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 { +define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 %oldval, ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i64: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 @@ -1583,7 +1583,7 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr ret void } -define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ptr addrspace(1) %out, double %src0, i32 %src1) #1 { +define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double %oldval, ptr addrspace(1) %out, double %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_f64: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll index aeb7faade4715..ff5d5c8dea567 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll @@ -1,21 +1,66 @@ -; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -S < %s | FileCheck -check-prefix=NO-PRELOAD %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s | FileCheck -check-prefix=PRELOAD %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -S < %s | FileCheck -check-prefix=NO-PRELOAD %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-kernarg-preload-count=100 -S < %s | FileCheck -check-prefix=PRELOAD %s + + +define amdgpu_kernel void @incompatible_attribute_block_count_x(ptr addrspace(1) %out, ptr addrspace(1) byref(i32) %arg) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@incompatible_attribute_block_count_x +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) byref(i32) [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[LOAD0:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[ARG]], align 4 +; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[LOAD0]], [[LOAD1]] +; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@incompatible_attribute_block_count_x +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) byref(i32) [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[LOAD0:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[ARG]], align 4 +; PRELOAD-NEXT: 
[[ADD:%.*]] = add i32 [[LOAD0]], [[LOAD1]] +; PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load0 = load i32, ptr addrspace(4) %imp_arg_ptr + %load1 = load i32, ptr addrspace(1) %arg + %add = add i32 %load0, %load1 + store i32 %add, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_aggregate_arg_block_count_x(ptr addrspace(1) %out, { i32, i32 } inreg) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_aggregate_arg_block_count_x +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], { i32, i32 } inreg [[TMP0:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@preload_aggregate_arg_block_count_x +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], { i32, i32 } inreg [[TMP0:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i32, ptr addrspace(4) %imp_arg_ptr + store i32 %load, ptr addrspace(1) %out + ret void +} define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) { -; NO-PRELOAD-LABEL: define amdgpu_kernel void @preload_block_count_x( -; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { -; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; NO-PRELOAD-NEXT: 
[[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 -; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]] +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_x +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { ; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 ; NO-PRELOAD-NEXT: ret void ; -; PRELOAD-LABEL: define amdgpu_kernel void @preload_block_count_x( -; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0:[0-9]+]] { -; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_x +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] { ; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 ; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_X]], ptr addrspace(1) [[OUT]], align 4 @@ -27,20 +72,37 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) { ret void } -define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i512) { -; NO-PRELOAD-LABEL: define amdgpu_kernel void @no_free_sgprs_block_count_x( -; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]], i512 [[TMP0:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = 
call nonnull align 16 dereferenceable(328) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 -; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) %out, i32 inreg) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_unused_arg_block_count_x +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 inreg [[TMP0:%.*]]) #[[ATTR0]] { ; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 ; NO-PRELOAD-NEXT: ret void ; -; PRELOAD-LABEL: define amdgpu_kernel void @no_free_sgprs_block_count_x( -; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i512 inreg [[TMP0:%.*]]) #[[ATTR0]] { -; PRELOAD-NEXT: [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(328) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-LABEL: define {{[^@]+}}@preload_unused_arg_block_count_x +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[TMP0:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_X]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i32, ptr addrspace(4) %imp_arg_ptr + store i32 %load, ptr 
addrspace(1) %out + ret void +} + +define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, <16 x i32> inreg) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@no_free_sgprs_block_count_x +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <16 x i32> inreg [[TMP0:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@no_free_sgprs_block_count_x +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <16 x i32> inreg [[TMP0:%.*]]) #[[ATTR0]] { ; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 ; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 @@ -52,31 +114,288 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i5 ret void } -define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) %out) { -; NO-PRELOAD-LABEL: define amdgpu_kernel void @preloadremainder_z( -; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: [[PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOADREMAINDER_Z_KERNARG_SEGMENT]], i64 0 -; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 inreg, i32) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@mixed_inreg_block_count_x +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 inreg [[TMP0:%.*]], i32 [[TMP1:%.*]]) 
#[[ATTR0]] { ; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@mixed_inreg_block_count_x +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[TMP0:%.*]], i32 inreg [[TMP1:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_X]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i32, ptr addrspace(4) %imp_arg_ptr + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i64_block_count_x +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8 +; NO-PRELOAD-NEXT: store i64 [[LOAD]], ptr addrspace(1) [[OUT]], align 8 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i64_block_count_x +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8 +; PRELOAD-NEXT: store i64 [[LOAD]], ptr addrspace(1) [[OUT]], align 8 +; 
PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i64, ptr addrspace(4) %imp_arg_ptr + store i64 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i16_block_count_x +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[IMP_ARG_PTR]], align 2 +; NO-PRELOAD-NEXT: store i16 [[LOAD]], ptr addrspace(1) [[OUT]], align 2 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i16_block_count_x +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[IMP_ARG_PTR]], align 2 +; PRELOAD-NEXT: store i16 [[LOAD]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i16, ptr addrspace(4) %imp_arg_ptr + store i16 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_y +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 4 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_y +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg 
"amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 4 +; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4 +; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_Y]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4 + %load = load i32, ptr addrspace(4) %gep + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@random_incorrect_offset +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 2 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@random_incorrect_offset +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 2 +; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4 +; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2 + %load = load i32, ptr addrspace(4) %gep + store i32 %load, ptr 
addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_z +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 8 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_z +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 8 +; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4 +; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_Z]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8 + %load = load i32, ptr addrspace(4) %gep + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspace(1) %out, i8 %val) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_x_imparg_align_ptr_i8 +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i8 [[VAL:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; NO-PRELOAD-NEXT: 
[[EXT:%.*]] = zext i8 [[VAL]] to i32 +; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] +; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_x_imparg_align_ptr_i8 +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg [[VAL:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; PRELOAD-NEXT: [[EXT:%.*]] = zext i8 [[VAL]] to i32 +; PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[_HIDDEN_BLOCK_COUNT_X]], [[EXT]] +; PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i32, ptr addrspace(4) %imp_arg_ptr + %ext = zext i8 %val to i32 + %add = add i32 %load, %ext + store i32 %add, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_xyz +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 0 +; NO-PRELOAD-NEXT: [[LOAD_X:%.*]] = load i32, ptr addrspace(4) [[GEP_X]], align 4 +; NO-PRELOAD-NEXT: [[GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 4 +; NO-PRELOAD-NEXT: [[LOAD_Y:%.*]] = load i32, ptr addrspace(4) [[GEP_Y]], align 4 +; NO-PRELOAD-NEXT: [[GEP_Z:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 8 +; NO-PRELOAD-NEXT: [[LOAD_Z:%.*]] = load i32, ptr addrspace(4) [[GEP_Z]], align 4 +; NO-PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[LOAD_X]], i32 0 +; 
NO-PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[LOAD_Y]], i32 1 +; NO-PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[LOAD_Z]], i32 2 +; NO-PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT]], align 16 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_xyz +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 0 +; PRELOAD-NEXT: [[LOAD_X:%.*]] = load i32, ptr addrspace(4) [[GEP_X]], align 4 +; PRELOAD-NEXT: [[GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 4 +; PRELOAD-NEXT: [[LOAD_Y:%.*]] = load i32, ptr addrspace(4) [[GEP_Y]], align 4 +; PRELOAD-NEXT: [[GEP_Z:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 8 +; PRELOAD-NEXT: [[LOAD_Z:%.*]] = load i32, ptr addrspace(4) [[GEP_Z]], align 4 +; PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[_HIDDEN_BLOCK_COUNT_X]], i32 0 +; PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[_HIDDEN_BLOCK_COUNT_Y]], i32 1 +; PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[_HIDDEN_BLOCK_COUNT_Z]], i32 2 +; PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT]], align 16 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0 + %load_x = load i32, ptr addrspace(4) %gep_x + %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4 + %load_y = load i32, ptr addrspace(4) %gep_y + %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, 
i32 8 + %load_z = load i32, ptr addrspace(4) %gep_z + %ins.0 = insertelement <3 x i32> poison, i32 %load_x, i32 0 + %ins.1 = insertelement <3 x i32> %ins.0, i32 %load_y, i32 1 + %ins.2 = insertelement <3 x i32> %ins.1, i32 %load_z, i32 2 + store <3 x i32> %ins.2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_x +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 12 ; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 ; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32 -; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 ; NO-PRELOAD-NEXT: ret void ; -; PRELOAD-LABEL: define amdgpu_kernel void @preloadremainder_z( -; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Z:%.*]]) #[[ATTR0]] { -; PRELOAD-NEXT: [[PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_x +; PRELOAD-SAME: (ptr addrspace(1) inreg 
[[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]]) #[[ATTR0]] { ; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 +; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 12 ; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 -; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Z]] to i32 +; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_X]] to i32 ; PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-NEXT: ret void ; %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12 + %load = load i16, ptr addrspace(4) %gep + %conv = zext i16 %load to i32 + store i32 %conv, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_y +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 14 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32 +; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_y +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" 
[[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 14 +; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_Y]] to i32 +; PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14 + %load = load i16, ptr addrspace(4) %gep + %conv = zext i16 %load to i32 + store i32 %conv, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_z +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32 +; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_z +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" 
[[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16 +; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_Z]] to i32 +; PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 %load = load i16, ptr addrspace(4) %gep %conv = zext i16 %load to i32 store i32 %conv, ptr addrspace(1) %out @@ -84,11 +403,8 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) %out) { } define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) %out) { -; NO-PRELOAD-LABEL: define amdgpu_kernel void @preload_workgroup_size_xyz( -; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT]], i64 0 -; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_xyz +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { ; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; NO-PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 12 ; NO-PRELOAD-NEXT: [[LOAD_X:%.*]] = load i16, ptr addrspace(4) 
[[GEP_X]], align 2 @@ -102,12 +418,11 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) %out) { ; NO-PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[CONV_X]], i32 0 ; NO-PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[CONV_Y]], i32 1 ; NO-PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV_Z]], i32 2 -; NO-PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT_LOAD]], align 16 +; NO-PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT]], align 16 ; NO-PRELOAD-NEXT: ret void ; -; PRELOAD-LABEL: define amdgpu_kernel void @preload_workgroup_size_xyz( -; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]]) #[[ATTR0]] { -; PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_xyz +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]]) #[[ATTR0]] { ; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 12 ; 
PRELOAD-NEXT: [[LOAD_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2 @@ -141,74 +456,182 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) %out) { ret void } -define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inreg %out) { -; NO-PRELOAD-LABEL: define amdgpu_kernel void @incorrect_type_i64_block_count_x( -; NO-PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_remainder_x +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { ; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8 -; NO-PRELOAD-NEXT: store i64 [[LOAD]], ptr addrspace(1) [[OUT]], align 8 +; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 18 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32 +; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 ; NO-PRELOAD-NEXT: ret void ; -; PRELOAD-LABEL: define amdgpu_kernel void @incorrect_type_i64_block_count_x( -; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { -; PRELOAD-NEXT: [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-LABEL: define {{[^@]+}}@preload_remainder_x +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" 
[[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]]) #[[ATTR0]] { ; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; PRELOAD-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8 -; PRELOAD-NEXT: store i64 [[LOAD]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 18 +; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_REMAINDER_X]] to i32 +; PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-NEXT: ret void ; %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %load = load i64, ptr addrspace(4) %imp_arg_ptr - store i64 %load, ptr addrspace(1) %out + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18 + %load = load i16, ptr addrspace(4) %gep + %conv = zext i16 %load to i32 + store i32 %conv, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) { -; NO-PRELOAD-LABEL: define amdgpu_kernel void @random_incorrect_offset( -; NO-PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preloadremainder_y +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { ; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], 
i32 2 -; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 20 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32 +; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 ; NO-PRELOAD-NEXT: ret void ; -; PRELOAD-LABEL: define amdgpu_kernel void @random_incorrect_offset( -; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { -; PRELOAD-NEXT: [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-LABEL: define {{[^@]+}}@preloadremainder_y +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]]) #[[ATTR0]] { ; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 2 -; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4 -; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 20 +; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Y]] to i32 
+; PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-NEXT: ret void ; %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2 - %load = load i32, ptr addrspace(4) %gep - store i32 %load, ptr addrspace(1) %out + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20 + %load = load i16, ptr addrspace(4) %gep + %conv = zext i16 %load to i32 + store i32 %conv, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @incompatible_attribute_block_count_x(ptr addrspace(1) byref(i32) %out) { -; NO-PRELOAD-LABEL: define amdgpu_kernel void @incompatible_attribute_block_count_x( -; NO-PRELOAD-SAME: ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; NO-PRELOAD-NEXT: [[OUT_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 -; NO-PRELOAD-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[OUT_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1) +define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preloadremainder_z +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32 +; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@preloadremainder_z +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" 
[[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Z:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 +; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Z]] to i32 +; PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 + %load = load i16, ptr addrspace(4) %gep + %conv = zext i16 %load to i32 + store i32 %conv, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preloadremainder_xyz +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 18 +; NO-PRELOAD-NEXT: [[LOAD_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2 +; NO-PRELOAD-NEXT: [[CONV_X:%.*]] = zext i16 [[LOAD_X]] to i32 +; NO-PRELOAD-NEXT: [[GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 20 +; NO-PRELOAD-NEXT: [[LOAD_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 2 +; NO-PRELOAD-NEXT: 
[[CONV_Y:%.*]] = zext i16 [[LOAD_Y]] to i32 +; NO-PRELOAD-NEXT: [[GEP_Z:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 +; NO-PRELOAD-NEXT: [[LOAD_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 2 +; NO-PRELOAD-NEXT: [[CONV_Z:%.*]] = zext i16 [[LOAD_Z]] to i32 +; NO-PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[CONV_X]], i32 0 +; NO-PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[CONV_Y]], i32 1 +; NO-PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV_Z]], i32 2 +; NO-PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT]], align 16 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@preloadremainder_xyz +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Z:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 18 +; PRELOAD-NEXT: [[LOAD_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2 +; PRELOAD-NEXT: [[CONV_X:%.*]] = zext i16 [[_HIDDEN_REMAINDER_X]] to i32 +; PRELOAD-NEXT: [[GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 20 +; PRELOAD-NEXT: [[LOAD_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 2 +; PRELOAD-NEXT: [[CONV_Y:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Y]] to i32 +; PRELOAD-NEXT: [[GEP_Z:%.*]] 
= getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 +; PRELOAD-NEXT: [[LOAD_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 2 +; PRELOAD-NEXT: [[CONV_Z:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Z]] to i32 +; PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[CONV_X]], i32 0 +; PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[CONV_Y]], i32 1 +; PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV_Z]], i32 2 +; PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT]], align 16 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18 + %load_x = load i16, ptr addrspace(4) %gep_x + %conv_x = zext i16 %load_x to i32 + %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20 + %load_y = load i16, ptr addrspace(4) %gep_y + %conv_y = zext i16 %load_y to i32 + %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 + %load_z = load i16, ptr addrspace(4) %gep_z + %conv_z = zext i16 %load_z to i32 + %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0 + %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1 + %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2 + store <3 x i32> %ins.2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@no_free_sgprs_preloadremainder_z +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32 +; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: 
ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@no_free_sgprs_preloadremainder_z +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Z:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 +; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Z]] to i32 +; PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 + %load = load i16, ptr addrspace(4) %gep + %conv = zext i16 %load to i32 + store i32 %conv, ptr addrspace(1) %out + ret void +} + + +define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) %out, i192 %t0, i32 %t1) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_max_user_sgprs +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i192 [[T0:%.*]], i32 [[T1:%.*]]) #[[ATTR0]] { ; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[TMP1]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr 
addrspace(1) [[OUT]], align 4 ; NO-PRELOAD-NEXT: ret void ; -; PRELOAD-LABEL: define amdgpu_kernel void @incompatible_attribute_block_count_x( -; PRELOAD-SAME: ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0]] { -; PRELOAD-NEXT: [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-NEXT: [[OUT_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 -; PRELOAD-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[OUT_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1) +; PRELOAD-LABEL: define {{[^@]+}}@preload_block_max_user_sgprs +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i192 inreg [[T0:%.*]], i32 inreg [[T1:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] { ; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 -; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[TMP1]], align 4 +; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_X]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-NEXT: ret void ; %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() @@ -217,6 +640,53 @@ define amdgpu_kernel void @incompatible_attribute_block_count_x(ptr addrspace(1) ret void } -;. -; NO-PRELOAD: [[META0]] = !{} -;. 
+define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_z_workgroup_size_z_remainder_z +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 8 +; NO-PRELOAD-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16 +; NO-PRELOAD-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 +; NO-PRELOAD-NEXT: [[LOAD0:%.*]] = load i32, ptr addrspace(4) [[GEP0]], align 4 +; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i16, ptr addrspace(4) [[GEP1]], align 2 +; NO-PRELOAD-NEXT: [[LOAD2:%.*]] = load i16, ptr addrspace(4) [[GEP2]], align 2 +; NO-PRELOAD-NEXT: [[CONV1:%.*]] = zext i16 [[LOAD1]] to i32 +; NO-PRELOAD-NEXT: [[CONV2:%.*]] = zext i16 [[LOAD2]] to i32 +; NO-PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[LOAD0]], i32 0 +; NO-PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[CONV1]], i32 1 +; NO-PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV2]], i32 2 +; NO-PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT]], align 16 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_z_workgroup_size_z_remainder_z +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg 
"amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Z:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 8 +; PRELOAD-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16 +; PRELOAD-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 +; PRELOAD-NEXT: [[LOAD0:%.*]] = load i32, ptr addrspace(4) [[GEP0]], align 4 +; PRELOAD-NEXT: [[LOAD1:%.*]] = load i16, ptr addrspace(4) [[GEP1]], align 2 +; PRELOAD-NEXT: [[LOAD2:%.*]] = load i16, ptr addrspace(4) [[GEP2]], align 2 +; PRELOAD-NEXT: [[CONV1:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_Z]] to i32 +; PRELOAD-NEXT: [[CONV2:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Z]] to i32 +; PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[_HIDDEN_BLOCK_COUNT_Z]], i32 0 +; PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[CONV1]], i32 1 +; PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV2]], i32 2 +; PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT]], align 16 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8 + %gep1 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 + %gep2 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 + %load0 = load i32, ptr addrspace(4) %gep0 + %load1 = load i16, ptr addrspace(4) %gep1 + %load2 = load i16, ptr addrspace(4) %gep2 + %conv1 = zext i16 %load1 to i32 + %conv2 = zext i16 %load2 to i32 + %ins.0 = insertelement <3 x i32> poison, i32 %load0, i32 0 + %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv1, i32 1 + %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv2, i32 2 + store <3 x i32> %ins.2, ptr addrspace(1) %out + ret void +} diff --git 
a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll index 31beb7a3cce24..7c667027bf542 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940 %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90a %s -define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x) #0 { ; GFX940-LABEL: preload_block_count_x: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -16,27 +15,13 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0 ; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preload_block_count_x: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB0_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB0_0: -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %load = load i32, ptr addrspace(4) %imp_arg_ptr - store i32 %load, ptr addrspace(1) %out + %load = load i32, ptr addrspace(4) %imp_arg_ptr, align 4 + store i32 %_hidden_block_count_x, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inreg %out, i32 inreg) #0 { +define 
amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inreg %out, i32 inreg %0, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x) #0 { ; GFX940-LABEL: preload_unused_arg_block_count_x: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -51,60 +36,30 @@ define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inr ; GFX940-NEXT: v_mov_b32_e32 v1, s6 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preload_unused_arg_block_count_x: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB1_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB1_0: -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s10 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %load = load i32, ptr addrspace(4) %imp_arg_ptr - store i32 %load, ptr addrspace(1) %out + %load = load i32, ptr addrspace(4) %imp_arg_ptr, align 4 + store i32 %_hidden_block_count_x, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %out, i256 inreg) { +define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %out, i256 inreg %0, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x) #0 { ; GFX940-LABEL: no_free_sgprs_block_count_x: ; GFX940: ; %bb.1: -; GFX940-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8 +; GFX940-NEXT: s_load_dword s12, s[0:1], 0x28 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_branch .LBB2_0 ; GFX940-NEXT: .p2align 8 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: .LBB2_0: -; GFX940-NEXT: 
s_load_dword s0, s[4:5], 0x28 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-NEXT: global_store_dword v0, v1, s[8:9] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v1, s12 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: no_free_sgprs_block_count_x: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB2_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB2_0: -; GFX90a-NEXT: s_load_dword s0, s[8:9], 0x28 -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[12:13] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %load = load i32, ptr addrspace(4) %imp_arg_ptr - store i32 %load, ptr addrspace(1) %out + %load = load i32, ptr addrspace(4) %imp_arg_ptr, align 4 + store i32 %_hidden_block_count_x, ptr addrspace(1) %out, align 4 ret void } @@ -118,26 +73,13 @@ define amdgpu_kernel void @no_inreg_block_count_x(ptr addrspace(1) %out) #0 { ; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: no_inreg_block_count_x: -; GFX90a: ; %bb.0: -; GFX90a-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %load = load i32, ptr addrspace(4) %imp_arg_ptr - store i32 %load, ptr addrspace(1) %out + %load = load i32, ptr addrspace(4) %imp_arg_ptr, align 4 + store i32 %load, ptr addrspace(1) %out, align 4 ret void } -; Implicit arg preloading is currently 
restricted to cases where all explicit -; args are inreg (preloaded). - -define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 inreg) #0 { +define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 inreg %0) #0 { ; GFX940-LABEL: mixed_inreg_block_count_x: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_load_dword s4, s[0:1], 0x10 @@ -147,19 +89,9 @@ define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 ; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: mixed_inreg_block_count_x: -; GFX90a: ; %bb.0: -; GFX90a-NEXT: s_load_dword s2, s[4:5], 0x10 -; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %load = load i32, ptr addrspace(4) %imp_arg_ptr - store i32 %load, ptr addrspace(1) %out + %load = load i32, ptr addrspace(4) %imp_arg_ptr, align 4 + store i32 %load, ptr addrspace(1) %out, align 4 ret void } @@ -178,24 +110,9 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: incorrect_type_i64_block_count_x: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB5_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB5_0: -; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90a-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] -; GFX90a-NEXT: s_endpgm 
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %load = load i64, ptr addrspace(4) %imp_arg_ptr - store i64 %load, ptr addrspace(1) %out + %load = load i64, ptr addrspace(4) %imp_arg_ptr, align 8 + store i64 %load, ptr addrspace(1) %out, align 8 ret void } @@ -214,28 +131,13 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr ; GFX940-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: incorrect_type_i16_block_count_x: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB6_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB6_0: -; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x8 -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_short v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %load = load i16, ptr addrspace(4) %imp_arg_ptr - store i16 %load, ptr addrspace(1) %out + %load = load i16, ptr addrspace(4) %imp_arg_ptr, align 2 + store i16 %load, ptr addrspace(1) %out, align 2 ret void } -define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y) #0 { ; GFX940-LABEL: preload_block_count_y: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -249,24 +151,10 @@ define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0 ; GFX940-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preload_block_count_y: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], 
s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB7_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB7_0: -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4 - %load = load i32, ptr addrspace(4) %gep - store i32 %load, ptr addrspace(1) %out + %load = load i32, ptr addrspace(4) %gep, align 4 + store i32 %_hidden_block_count_y, ptr addrspace(1) %out, align 4 ret void } @@ -286,30 +174,14 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) ; GFX940-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: random_incorrect_offset: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB8_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB8_0: -; GFX90a-NEXT: s_mov_b32 s0, 8 -; GFX90a-NEXT: s_load_dword s0, s[4:5], s0 offset:0x2 -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2 - %load = load i32, ptr addrspace(4) %gep - store i32 %load, ptr addrspace(1) %out + %load = load i32, ptr addrspace(4) %gep, align 4 + store i32 %load, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" 
%_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z) #0 { ; GFX940-LABEL: preload_block_count_z: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -324,29 +196,14 @@ define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0 ; GFX940-NEXT: v_mov_b32_e32 v1, s6 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preload_block_count_z: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB9_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB9_0: -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s10 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8 - %load = load i32, ptr addrspace(4) %gep - store i32 %load, ptr addrspace(1) %out + %load = load i32, ptr addrspace(4) %gep, align 4 + store i32 %_hidden_block_count_z, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspace(1) inreg %out, i8 inreg %val) #0 { +define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspace(1) inreg %out, i8 inreg %val, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x) #0 { ; GFX940-LABEL: preload_block_count_x_imparg_align_ptr_i8: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -363,32 +220,15 @@ define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspa ; GFX940-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: 
preload_block_count_x_imparg_align_ptr_i8: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB10_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB10_0: -; GFX90a-NEXT: s_and_b32 s0, s8, 0xff -; GFX90a-NEXT: s_add_i32 s0, s10, s0 -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %load = load i32, ptr addrspace(4) %imp_arg_ptr + %load = load i32, ptr addrspace(4) %imp_arg_ptr, align 4 %ext = zext i8 %val to i32 - %add = add i32 %load, %ext - store i32 %add, ptr addrspace(1) %out + %add = add i32 %_hidden_block_count_x, %ext + store i32 %add, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z) #0 { ; GFX940-LABEL: preload_block_count_xyz: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -405,38 +245,21 @@ define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out) ; GFX940-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preload_block_count_xyz: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB11_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; 
GFX90a-NEXT: .LBB11_0: -; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-NEXT: v_mov_b32_e32 v2, s10 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0 - %load_x = load i32, ptr addrspace(4) %gep_x + %load_x = load i32, ptr addrspace(4) %gep_x, align 4 %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4 - %load_y = load i32, ptr addrspace(4) %gep_y + %load_y = load i32, ptr addrspace(4) %gep_y, align 4 %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8 - %load_z = load i32, ptr addrspace(4) %gep_z - %ins.0 = insertelement <3 x i32> poison, i32 %load_x, i32 0 - %ins.1 = insertelement <3 x i32> %ins.0, i32 %load_y, i32 1 - %ins.2 = insertelement <3 x i32> %ins.1, i32 %load_z, i32 2 - store <3 x i32> %ins.2, ptr addrspace(1) %out + %load_z = load i32, ptr addrspace(4) %gep_z, align 4 + %ins.0 = insertelement <3 x i32> poison, i32 %_hidden_block_count_x, i32 0 + %ins.1 = insertelement <3 x i32> %ins.0, i32 %_hidden_block_count_y, i32 1 + %ins.2 = insertelement <3 x i32> %ins.1, i32 %_hidden_block_count_z, i32 2 + store <3 x i32> %ins.2, ptr addrspace(1) %out, align 16 ret void } -define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x) #0 { ; GFX940-LABEL: preload_workgroup_size_x: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -451,30 +274,15 @@ define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out) ; GFX940-NEXT: 
v_mov_b32_e32 v1, s0 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preload_workgroup_size_x: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB12_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB12_0: -; GFX90a-NEXT: s_and_b32 s0, s11, 0xffff -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12 - %load = load i16, ptr addrspace(4) %gep - %conv = zext i16 %load to i32 - store i32 %conv, ptr addrspace(1) %out + %load = load i16, ptr addrspace(4) %gep, align 2 + %conv = zext i16 %_hidden_group_size_x to i32 + store i32 %conv, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y) #0 { ; GFX940-LABEL: preload_workgroup_size_y: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -489,30 +297,15 @@ define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out) ; GFX940-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preload_workgroup_size_y: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB13_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB13_0: -; GFX90a-NEXT: s_lshr_b32 s0, s11, 16 -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14 - %load = load i16, ptr addrspace(4) %gep - %conv = zext i16 %load to i32 - store i32 %conv, ptr addrspace(1) %out + %load = load i16, ptr addrspace(4) %gep, align 2 + %conv = zext i16 %_hidden_group_size_y to i32 + store i32 %conv, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z) #0 { ; GFX940-LABEL: preload_workgroup_size_z: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -528,31 +321,15 @@ define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out) ; GFX940-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preload_workgroup_size_z: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB14_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB14_0: -; GFX90a-NEXT: s_and_b32 s0, s12, 0xffff -; 
GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 - %load = load i16, ptr addrspace(4) %gep - %conv = zext i16 %load to i32 - store i32 %conv, ptr addrspace(1) %out + %load = load i16, ptr addrspace(4) %gep, align 2 + %conv = zext i16 %_hidden_group_size_z to i32 + store i32 %conv, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z) #0 { ; GFX940-LABEL: preload_workgroup_size_xyz: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -572,44 +349,24 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %ou ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preload_workgroup_size_xyz: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB15_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB15_0: -; GFX90a-NEXT: s_lshr_b32 s0, s11, 16 -; GFX90a-NEXT: s_and_b32 s1, s11, 0xffff -; GFX90a-NEXT: s_and_b32 s2, s12, 0xffff -; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s1 -; GFX90a-NEXT: 
v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12 - %load_x = load i16, ptr addrspace(4) %gep_x - %conv_x = zext i16 %load_x to i32 + %load_x = load i16, ptr addrspace(4) %gep_x, align 2 + %conv_x = zext i16 %_hidden_group_size_x to i32 %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14 - %load_y = load i16, ptr addrspace(4) %gep_y - %conv_y = zext i16 %load_y to i32 + %load_y = load i16, ptr addrspace(4) %gep_y, align 2 + %conv_y = zext i16 %_hidden_group_size_y to i32 %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 - %load_z = load i16, ptr addrspace(4) %gep_z - %conv_z = zext i16 %load_z to i32 - %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0 - %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1 - %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2 - store <3 x i32> %ins.2, ptr addrspace(1) %out + %load_z = load i16, ptr addrspace(4) %gep_z, align 2 + %conv_z = zext i16 %_hidden_group_size_z to i32 + %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0 + %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1 + %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2 + store <3 x i32> %ins.2, ptr addrspace(1) %out, align 16 ret void } -define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" 
%_hidden_remainder_x) #0 { ; GFX940-LABEL: preload_remainder_x: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -625,31 +382,15 @@ define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 { ; GFX940-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preload_remainder_x: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB16_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB16_0: -; GFX90a-NEXT: s_lshr_b32 s0, s12, 16 -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18 - %load = load i16, ptr addrspace(4) %gep - %conv = zext i16 %load to i32 - store i32 %conv, ptr addrspace(1) %out + %load = load i16, ptr addrspace(4) %gep, align 2 + %conv = zext i16 %_hidden_remainder_x to i32 + store i32 %conv, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_x, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_y) #0 { ; GFX940-LABEL: preloadremainder_y: ; GFX940: ; %bb.1: ; 
GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -665,31 +406,15 @@ define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 { ; GFX940-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preloadremainder_y: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB17_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB17_0: -; GFX90a-NEXT: s_and_b32 s0, s13, 0xffff -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20 - %load = load i16, ptr addrspace(4) %gep - %conv = zext i16 %load to i32 - store i32 %conv, ptr addrspace(1) %out + %load = load i16, ptr addrspace(4) %gep, align 2 + %conv = zext i16 %_hidden_remainder_y to i32 + store i32 %conv, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_x, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_y, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_z) #0 { ; GFX940-LABEL: preloadremainder_z: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x0 @@ -705,31 +430,15 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 { ; GFX940-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preloadremainder_z: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB18_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB18_0: -; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 - %load = load i16, ptr addrspace(4) %gep - %conv = zext i16 %load to i32 - store i32 %conv, ptr addrspace(1) %out + %load = load i16, ptr addrspace(4) %gep, align 2 + %conv = zext i16 %_hidden_remainder_z to i32 + store i32 %conv, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_x, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_y, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_z) #0 { ; GFX940-LABEL: preloadremainder_xyz: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ 
-749,47 +458,29 @@ define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0 ; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preloadremainder_xyz: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB19_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB19_0: -; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 -; GFX90a-NEXT: s_lshr_b32 s1, s12, 16 -; GFX90a-NEXT: s_and_b32 s2, s13, 0xffff -; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s1 -; GFX90a-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-NEXT: v_mov_b32_e32 v2, s0 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18 - %load_x = load i16, ptr addrspace(4) %gep_x - %conv_x = zext i16 %load_x to i32 + %load_x = load i16, ptr addrspace(4) %gep_x, align 2 + %conv_x = zext i16 %_hidden_remainder_x to i32 %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20 - %load_y = load i16, ptr addrspace(4) %gep_y - %conv_y = zext i16 %load_y to i32 + %load_y = load i16, ptr addrspace(4) %gep_y, align 2 + %conv_y = zext i16 %_hidden_remainder_y to i32 %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 - %load_z = load i16, ptr addrspace(4) %gep_z - %conv_z = zext i16 %load_z to i32 - %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0 - %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1 - %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2 - store <3 x i32> %ins.2, ptr addrspace(1) %out + %load_z = load i16, ptr addrspace(4) %gep_z, align 2 + %conv_z = zext i16 %_hidden_remainder_z to i32 + 
%ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0 + %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1 + %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2 + store <3 x i32> %ins.2, ptr addrspace(1) %out, align 16 ret void } -define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inreg %out) { +define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inreg %out, i128 inreg, i64 inreg, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_x, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_y, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_z) #0 { ; GFX940-LABEL: no_free_sgprs_preloadremainder_z: ; GFX940: ; %bb.1: -; GFX940-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8 +; GFX940-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x28 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_branch .LBB20_0 ; GFX940-NEXT: .p2align 8 @@ -798,74 +489,41 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr ; GFX940-NEXT: s_lshr_b32 s0, s15, 16 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-NEXT: global_store_dword v0, v1, s[8:9] sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: no_free_sgprs_preloadremainder_z: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB20_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB20_0: -; 
GFX90a-NEXT: s_load_dword s0, s[8:9], 0x1c -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_lshr_b32 s0, s0, 16 -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[12:13] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 - %load = load i16, ptr addrspace(4) %gep - %conv = zext i16 %load to i32 - store i32 %conv, ptr addrspace(1) %out + %load = load i16, ptr addrspace(4) %gep, align 2 + %conv = zext i16 %_hidden_remainder_z to i32 + store i32 %conv, ptr addrspace(1) %out, align 4 ret void } -; Check for consistency between isel and earlier passes preload SGPR accounting with max preload SGPRs. +; This should use s15 for the hidden argument. -define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %out, i192 inreg %t0, i32 inreg %t1) #0 { -; GFX940-LABEL: preload_block_max_user_sgprs: +define amdgpu_kernel void @preload_block_y_max_user_sgprs(ptr addrspace(1) inreg %out, i256 inreg, i64 inreg, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y) #0 { +; GFX940-LABEL: preload_block_y_max_user_sgprs: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8 -; GFX940-NEXT: s_load_dword s12, s[0:1], 0x28 +; GFX940-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x28 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_branch .LBB21_0 ; GFX940-NEXT: .p2align 8 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: .LBB21_0: ; GFX940-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NEXT: v_mov_b32_e32 v1, s12 +; GFX940-NEXT: v_mov_b32_e32 v1, s15 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preload_block_max_user_sgprs: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], 
s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 -; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x20 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB21_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB21_0: -; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x28 -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %load = load i32, ptr addrspace(4) %imp_arg_ptr - store i32 %load, ptr addrspace(1) %out + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4 + %load = load i32, ptr addrspace(4) %gep, align 4 + store i32 %_hidden_block_count_y, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_x, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_y, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_z) #0 { ; GFX940-LABEL: preload_block_count_z_workgroup_size_z_remainder_z: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -884,39 +542,20 @@ define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(pt ; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: 
preload_block_count_z_workgroup_size_z_remainder_z: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB22_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB22_0: -; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 -; GFX90a-NEXT: s_and_b32 s1, s12, 0xffff -; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-NEXT: v_mov_b32_e32 v2, s0 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8 %gep1 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 %gep2 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 - %load0 = load i32, ptr addrspace(4) %gep0 - %load1 = load i16, ptr addrspace(4) %gep1 - %load2 = load i16, ptr addrspace(4) %gep2 - %conv1 = zext i16 %load1 to i32 - %conv2 = zext i16 %load2 to i32 - %ins.0 = insertelement <3 x i32> poison, i32 %load0, i32 0 - %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv1, i32 1 - %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv2, i32 2 - store <3 x i32> %ins.2, ptr addrspace(1) %out + %load0 = load i32, ptr addrspace(4) %gep0, align 4 + %load1 = load i16, ptr addrspace(4) %gep1, align 2 + %load2 = load i16, ptr addrspace(4) %gep2, align 2 + %conv1 = zext i16 %_hidden_group_size_z to i32 + %conv2 = zext i16 %_hidden_remainder_z to i32 + %ins.0 = insertelement <3 x i32> poison, i32 %_hidden_block_count_z, i32 0 + %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv1, i32 1 + %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv2, i32 2 + store <3 x i32> %ins.2, ptr addrspace(1) %out, align 16 ret void } -attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" 
"amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx940" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll index ab0fb7584d50c..89ad57a51ff1e 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll @@ -1,101 +1,56 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -S < %s | FileCheck -check-prefix=NO-PRELOAD %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=1 -S < %s | FileCheck -check-prefix=PRELOAD-1 %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=3 -S < %s | FileCheck -check-prefix=PRELOAD-3 %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=8 -S < %s | FileCheck -check-prefix=PRELOAD-8 %s +; 
RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -S < %s | FileCheck -check-prefix=NO-PRELOAD %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-kernarg-preload-count=2 -S < %s | FileCheck -check-prefix=PRELOAD-2 %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-kernarg-preload-count=100 -S < %s | FileCheck -check-prefix=PRELOAD-ALL %s -define amdgpu_kernel void @test_preload_IR_lowering_kernel_2(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2 +define amdgpu_kernel void @ptr1_ptr1_kernel(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_ptr1_kernel ; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { -; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT]], i64 0 -; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]] -; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT]], i64 8 -; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2 -; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { -; 
PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT]], i64 8 -; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]] -; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2 -; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { -; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2 -; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { -; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 -; PRELOAD-8-NEXT: ret void +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_ptr1_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) 
inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; PRELOAD-2-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_ptr1_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void ; %load = load i32, ptr addrspace(1) %in store i32 %load, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @test_preload_IR_lowering_kernel_4(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4 +define amdgpu_kernel void @ptr1_ptr1_ptr1_ptr1_kernel(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_kernel ; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 0 -; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 8 -; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) 
[[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 16 -; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 24 -; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 -; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4 -; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 8 -; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 16 -; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; 
PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 24 -; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4 -; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 24 -; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]] -; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4 -; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-8-NEXT: 
[[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 -; PRELOAD-8-NEXT: ret void +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-2-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-ALL-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 +; PRELOAD-ALL-NEXT: ret void ; %load = load i32, 
ptr addrspace(1) %in %load1 = load i32, ptr addrspace(1) %in1 @@ -104,100 +59,42 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4(ptr addrspace(1) %i ret void } -define amdgpu_kernel void @test_preload_IR_lowering_kernel_8(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3, ptr addrspace(1) %out, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8 +define amdgpu_kernel void @ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_kernel(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3, ptr addrspace(1) %out, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_kernel ; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 0 -; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 8 -; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 16 -; NO-PRELOAD-NEXT: [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 24 -; NO-PRELOAD-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 32 -; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 40 -; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 48 -; NO-PRELOAD-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56 -; NO-PRELOAD-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 -; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; NO-PRELOAD-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4 -; NO-PRELOAD-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) 
[[IN3_LOAD]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4 -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8 -; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 8 -; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 16 -; PRELOAD-1-NEXT: [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 24 -; PRELOAD-1-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 32 -; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr 
addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 40 -; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 48 -; PRELOAD-1-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56 -; PRELOAD-1-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-1-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4 -; PRELOAD-1-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4 -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8 -; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] { 
-; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-3-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 24 -; PRELOAD-3-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 32 -; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 40 -; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-3-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 48 -; PRELOAD-3-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-3-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56 -; PRELOAD-3-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 -; PRELOAD-3-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4 -; PRELOAD-3-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD]], 
ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4 -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8 -; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) inreg [[OUT3:%.*]]) #[[ATTR0]] { -; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-8-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56 -; PRELOAD-8-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]] -; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 -; PRELOAD-8-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4 -; PRELOAD-8-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4 -; PRELOAD-8-NEXT: ret void +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; 
NO-PRELOAD-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4 +; NO-PRELOAD-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-2-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-2-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4 +; PRELOAD-2-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3]], align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2]], align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; 
PRELOAD-ALL-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-ALL-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4 +; PRELOAD-ALL-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3]], align 4 +; PRELOAD-ALL-NEXT: ret void ; %load = load i32, ptr addrspace(1) %in %load1 = load i32, ptr addrspace(1) %in1 @@ -210,58 +107,30 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_8(ptr addrspace(1) %i ret void } -; Preload args with inreg in the NO-PRELOAD case. - -define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %out1) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset -; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 0 -; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8 -; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) 
[[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16 -; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24 -; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 -; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset -; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8 -; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16 -; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr 
addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24 -; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset -; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset -; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-8-NEXT: 
[[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 -; PRELOAD-8-NEXT: ret void +define amdgpu_kernel void @ptr1_ptr1_ptr1_ptr1_inreg_offset_kernel(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-2-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-ALL-NEXT: [[LOAD1:%.*]] = 
load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 +; PRELOAD-ALL-NEXT: ret void ; %load = load i32, ptr addrspace(1) %in %load1 = load i32, ptr addrspace(1) %in1 @@ -270,56 +139,30 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset(ptr ad ret void } -; Only preload the first sequence of arguments with the inreg attribute. In the NO-PRELOAD case this is just the first argument. - -define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence(ptr addrspace(1) inreg %in, ptr addrspace(1) %in1, ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %out1) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence -; NO-PRELOAD-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8 -; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16 -; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24 -; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] +define amdgpu_kernel void @ptr1_ptr1_ptr1_ptr1_inreg_offset_two_sequence_kernel(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_two_sequence_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { ; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence -; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8 -; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16 -; PRELOAD-1-NEXT: 
[[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24 -; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence -; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence -; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 
dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 -; PRELOAD-8-NEXT: ret void +; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_two_sequence_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-2-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_two_sequence_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-ALL-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 +; PRELOAD-ALL-NEXT: ret void ; %load = load i32, ptr addrspace(1) %in %load1 = load i32, ptr addrspace(1) %in1 @@ -328,73 +171,36 @@ define amdgpu_kernel void 
@test_preload_IR_lowering_kernel_4_inreg_offset_two_se ret void } -define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_misaligned(i16 %arg0, ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned +define amdgpu_kernel void @i16_ptr1_ptr1_ptr1_ptr1_misaligned_kernel(i16 %arg0, ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@i16_ptr1_ptr1_ptr1_ptr1_misaligned_kernel ; NO-PRELOAD-SAME: (i16 [[ARG0:%.*]], ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 0 -; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 -; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 8 -; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 16 -; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr 
addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24 -; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32 -; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 -; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 ; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] -; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned -; PRELOAD-1-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-1-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 8 -; PRELOAD-1-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: 
[[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 16 -; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24 -; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32 -; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 -; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-1-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 -; PRELOAD-1-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] -; PRELOAD-1-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned -; PRELOAD-3-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24 -; 
PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32 -; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 -; PRELOAD-3-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 -; PRELOAD-3-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] -; PRELOAD-3-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned -; PRELOAD-8-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 -; PRELOAD-8-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 -; PRELOAD-8-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] -; PRELOAD-8-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 -; PRELOAD-8-NEXT: ret void +; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; 
PRELOAD-2-LABEL: define {{[^@]+}}@i16_ptr1_ptr1_ptr1_ptr1_misaligned_kernel +; PRELOAD-2-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-2-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 +; PRELOAD-2-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] +; PRELOAD-2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@i16_ptr1_ptr1_ptr1_ptr1_misaligned_kernel +; PRELOAD-ALL-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-ALL-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 +; PRELOAD-ALL-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] +; PRELOAD-ALL-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 +; PRELOAD-ALL-NEXT: ret void ; %load = load i32, ptr addrspace(1) %in %load1 = load i32, ptr addrspace(1) %in1 @@ -405,59 +211,30 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_misaligned(i16 %arg ret void } -; In this case both i16 args with be preloaded into the first SGPR. 
- -define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_i16_i16(i16 %arg0, i16 %arg1, ptr addrspace(1) %out) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16 +define amdgpu_kernel void @i16_i16_ptr1_kernel(i16 %arg0, i16 %arg1, ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@i16_i16_ptr1_kernel ; NO-PRELOAD-SAME: (i16 [[ARG0:%.*]], i16 [[ARG1:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0 -; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 -; NO-PRELOAD-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0 -; NO-PRELOAD-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16 -; NO-PRELOAD-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 -; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 8 -; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32 -; NO-PRELOAD-NEXT: [[EXT1:%.*]] = zext i16 [[TMP5]] to i32 +; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 +; NO-PRELOAD-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32 ; NO-PRELOAD-NEXT: [[ADD:%.*]] 
= add i32 [[EXT]], [[EXT1]] -; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16 -; PRELOAD-1-SAME: (i16 inreg [[ARG0:%.*]], i16 [[ARG1:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-1-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0 -; PRELOAD-1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16 -; PRELOAD-1-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16 -; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 8 -; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 -; PRELOAD-1-NEXT: [[EXT1:%.*]] = zext i16 [[TMP3]] to i32 -; PRELOAD-1-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] -; PRELOAD-1-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16 -; PRELOAD-3-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-3-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 -; PRELOAD-3-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to 
i32 -; PRELOAD-3-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] -; PRELOAD-3-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16 -; PRELOAD-8-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { -; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-8-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 -; PRELOAD-8-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32 -; PRELOAD-8-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] -; PRELOAD-8-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 -; PRELOAD-8-NEXT: ret void +; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@i16_i16_ptr1_kernel +; PRELOAD-2-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 +; PRELOAD-2-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32 +; PRELOAD-2-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] +; PRELOAD-2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@i16_i16_ptr1_kernel +; PRELOAD-ALL-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 +; PRELOAD-ALL-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32 +; PRELOAD-ALL-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] +; PRELOAD-ALL-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void ; %ext = zext i16 %arg0 to i32 %ext1 = zext i16 %arg1 to i32 @@ -466,4 +243,754 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_i16_i16(i16 %arg0, 
ret void } -attributes #0 = { nounwind } +define amdgpu_kernel void @ptr1_i8_kernel(ptr addrspace(1) %out, i8 %arg0) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i8_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i8 [[ARG0:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32 +; NO-PRELOAD-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i8_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg [[ARG0:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32 +; PRELOAD-2-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i8_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg [[ARG0:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32 +; PRELOAD-ALL-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + %ext = zext i8 %arg0 to i32 + store i32 %ext, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_i8_zeroext_kernel(ptr addrspace(1) %out, i8 zeroext %arg0) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i8_zeroext_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i8 zeroext [[ARG0:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32 +; NO-PRELOAD-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i8_zeroext_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg zeroext [[ARG0:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32 +; PRELOAD-2-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i8_zeroext_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg zeroext [[ARG0:%.*]]) 
#[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32 +; PRELOAD-ALL-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + %ext = zext i8 %arg0 to i32 + store i32 %ext, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_i16_kernel(ptr addrspace(1) %out, i16 %arg0) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[ARG0:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 +; NO-PRELOAD-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[ARG0:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 +; PRELOAD-2-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[ARG0:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 +; PRELOAD-ALL-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + %ext = zext i16 %arg0 to i32 + store i32 %ext, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_i32_kernel(ptr addrspace(1) %out, i32 %arg0) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i32_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[ARG0:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store i32 [[ARG0]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i32_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[ARG0:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store i32 [[ARG0]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define 
{{[^@]+}}@ptr1_i32_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[ARG0:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store i32 [[ARG0]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + store i32 %arg0, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @i32_ptr1_i32_kernel(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@i32_ptr1_i32_kernel +; NO-PRELOAD-SAME: (i32 [[ARG0:%.*]], ptr addrspace(1) [[OUT:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[ARG0]], [[ARG1]] +; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@i32_ptr1_i32_kernel +; PRELOAD-2-SAME: (i32 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[ADD:%.*]] = add i32 [[ARG0]], [[ARG1]] +; PRELOAD-2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@i32_ptr1_i32_kernel +; PRELOAD-ALL-SAME: (i32 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[ARG1:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[ADD:%.*]] = add i32 [[ARG0]], [[ARG1]] +; PRELOAD-ALL-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + %add = add i32 %arg0, %arg1 + store i32 %add, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_i16_i16_kernel(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_i16_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[ARG0:%.*]], i16 [[ARG1:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 +; NO-PRELOAD-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32 +; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] +; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret 
void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_i16_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[ARG0:%.*]], i16 [[ARG1:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 +; PRELOAD-2-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32 +; PRELOAD-2-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] +; PRELOAD-2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_i16_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 +; PRELOAD-ALL-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32 +; PRELOAD-ALL-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] +; PRELOAD-ALL-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + %ext = zext i16 %arg0 to i32 + %ext1 = zext i16 %arg1 to i32 + %add = add i32 %ext, %ext1 + store i32 %add, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_v2i8_kernel(ptr addrspace(1) %out, <2 x i8> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v2i8_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x i8> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store <2 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 2 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v2i8_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <2 x i8> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store <2 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v2i8_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <2 x i8> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store <2 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-ALL-NEXT: ret void +; + store <2 x i8> %in, ptr addrspace(1) %out + ret void +} + 
+define amdgpu_kernel void @ptr1_byref_i32_i32_kernel(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_byref_i32_i32_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF]], align 4 +; NO-PRELOAD-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: store volatile i32 [[AFTER_OFFSET]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_byref_i32_i32_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF]], align 4 +; PRELOAD-2-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: store volatile i32 [[AFTER_OFFSET]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_byref_i32_i32_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF]], align 4 +; PRELOAD-ALL-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: store volatile i32 [[AFTER_OFFSET]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + %in = load i32, ptr addrspace(4) %in.byref + store volatile i32 %in, ptr addrspace(1) %out, align 4 + store volatile i32 %after.offset, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_byref_i32_i32_staggered_kernel(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) { +; 
NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_byref_i32_i32_staggered_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF]], align 4 +; NO-PRELOAD-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: store volatile i32 [[AFTER_OFFSET]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_byref_i32_i32_staggered_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF]], align 4 +; PRELOAD-2-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: store volatile i32 [[AFTER_OFFSET]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_byref_i32_i32_staggered_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF]], align 4 +; PRELOAD-ALL-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: store volatile i32 [[AFTER_OFFSET]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + %in = load i32, ptr addrspace(4) %in.byref + store volatile i32 %in, ptr addrspace(1) %out, align 4 + store volatile i32 %after.offset, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_v8i32_kernel(ptr addrspace(1) nocapture %out, <8 x i32> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v8i32_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) captures(none) [[OUT:%.*]], <8 x i32> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store <8 x 
i32> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v8i32_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <8 x i32> [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store <8 x i32> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v8i32_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <8 x i32> [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store <8 x i32> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + store <8 x i32> %in, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_v3i16_kernel(ptr addrspace(1) nocapture %out, <3 x i16> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v3i16_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) captures(none) [[OUT:%.*]], <3 x i16> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store <3 x i16> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v3i16_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <3 x i16> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store <3 x i16> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v3i16_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <3 x i16> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store <3 x i16> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + store <3 x i16> %in, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_v3i32_kernel(ptr addrspace(1) nocapture %out, <3 x i32> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v3i32_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) captures(none) [[OUT:%.*]], <3 x i32> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store <3 x i32> [[IN]], ptr addrspace(1) 
[[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v3i32_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <3 x i32> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store <3 x i32> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v3i32_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <3 x i32> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store <3 x i32> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + store <3 x i32> %in, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_v3f32_kernel(ptr addrspace(1) nocapture %out, <3 x float> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v3f32_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) captures(none) [[OUT:%.*]], <3 x float> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store <3 x float> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v3f32_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <3 x float> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store <3 x float> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v3f32_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <3 x float> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store <3 x float> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + store <3 x float> %in, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_v5i8_kernel(ptr addrspace(1) nocapture %out, <5 x i8> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v5i8_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) captures(none) [[OUT:%.*]], <5 x i8> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store <5 x i8> [[IN]], ptr addrspace(1) 
[[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v5i8_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <5 x i8> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store <5 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v5i8_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <5 x i8> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store <5 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + store <5 x i8> %in, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_v5f64_kernel(ptr addrspace(1) nocapture %out, <5 x double> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v5f64_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) captures(none) [[OUT:%.*]], <5 x double> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store <5 x double> [[IN]], ptr addrspace(1) [[OUT]], align 8 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v5f64_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <5 x double> [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store <5 x double> [[IN]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v5f64_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg captures(none) [[OUT:%.*]], <5 x double> [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store <5 x double> [[IN]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-ALL-NEXT: ret void +; + store <5 x double> %in, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @ptr1_v8i8_kernel(ptr addrspace(1) %out, <8 x i8> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v8i8_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i8> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store <8 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 8 +; NO-PRELOAD-NEXT: ret 
void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v8i8_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <8 x i8> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store <8 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v8i8_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <8 x i8> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store <8 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-ALL-NEXT: ret void +; + store <8 x i8> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_i64_kernel(ptr addrspace(1) %out, i64 %a) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i64_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i64 [[A:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store i64 [[A]], ptr addrspace(1) [[OUT]], align 8 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i64_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i64 inreg [[A:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store i64 [[A]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i64_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i64 inreg [[A:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store i64 [[A]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-ALL-NEXT: ret void +; + store i64 %a, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @ptr1_f64_kernel(ptr addrspace(1) %out, double %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_f64_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], double [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store double [[IN]], ptr addrspace(1) [[OUT]], align 8 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_f64_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], double inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store double [[IN]], ptr addrspace(1) [[OUT]], 
align 8 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_f64_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], double inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store double [[IN]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-ALL-NEXT: ret void +; + store double %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_half_kernel(ptr addrspace(1) %out, half %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_half_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], half [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store half [[IN]], ptr addrspace(1) [[OUT]], align 2 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_half_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], half inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store half [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_half_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], half inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store half [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-ALL-NEXT: ret void +; + store half %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_bfloat_kernel(ptr addrspace(1) %out, bfloat %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_bfloat_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], bfloat [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store bfloat [[IN]], ptr addrspace(1) [[OUT]], align 2 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_bfloat_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], bfloat inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store bfloat [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_bfloat_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], bfloat inreg [[IN:%.*]]) #[[ATTR0]] { +; 
PRELOAD-ALL-NEXT: store bfloat [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-ALL-NEXT: ret void +; + store bfloat %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_v2bfloat_kernel(ptr addrspace(1) %out, <2 x bfloat> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v2bfloat_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x bfloat> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store <2 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v2bfloat_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <2 x bfloat> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store <2 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v2bfloat_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <2 x bfloat> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store <2 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + store <2 x bfloat> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_v3bfloat_kernel(ptr addrspace(1) %out, <3 x bfloat> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v3bfloat_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <3 x bfloat> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store <3 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 8 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v3bfloat_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <3 x bfloat> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store <3 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v3bfloat_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <3 x bfloat> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store <3 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 8 +; 
PRELOAD-ALL-NEXT: ret void +; + store <3 x bfloat> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_v6bfloat_kernel(ptr addrspace(1) %out, <6 x bfloat> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v6bfloat_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <6 x bfloat> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store <6 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 16 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v6bfloat_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <6 x bfloat> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store <6 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 16 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v6bfloat_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <6 x bfloat> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store <6 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 16 +; PRELOAD-ALL-NEXT: ret void +; + store <6 x bfloat> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_half_v7bfloat_kernel(ptr addrspace(1) %out, half %in, <7 x bfloat> %in2, ptr addrspace(1) %out2) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_half_v7bfloat_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], half [[IN:%.*]], <7 x bfloat> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store half [[IN]], ptr addrspace(1) [[OUT]], align 2 +; NO-PRELOAD-NEXT: store <7 x bfloat> [[IN2]], ptr addrspace(1) [[OUT2]], align 16 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_half_v7bfloat_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], half inreg [[IN:%.*]], <7 x bfloat> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store half [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-2-NEXT: store <7 x bfloat> [[IN2]], ptr addrspace(1) [[OUT2]], align 16 +; PRELOAD-2-NEXT: ret void +; +; 
PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_half_v7bfloat_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], half inreg [[IN:%.*]], <7 x bfloat> inreg [[IN2:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store half [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-ALL-NEXT: store <7 x bfloat> [[IN2]], ptr addrspace(1) [[OUT2]], align 16 +; PRELOAD-ALL-NEXT: ret void +; + store half %in, ptr addrspace(1) %out + store <7 x bfloat> %in2, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @ptr1_i1_kernel(ptr addrspace(1) %out, i1 %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i1_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i1 [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store i1 [[IN]], ptr addrspace(1) [[OUT]], align 1 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i1_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i1 inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store i1 [[IN]], ptr addrspace(1) [[OUT]], align 1 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i1_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i1 inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store i1 [[IN]], ptr addrspace(1) [[OUT]], align 1 +; PRELOAD-ALL-NEXT: ret void +; + store i1 %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_fp128_kernel(ptr addrspace(1) %out, fp128 %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_fp128_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], fp128 [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store fp128 [[IN]], ptr addrspace(1) [[OUT]], align 16 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_fp128_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], fp128 inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store fp128 [[IN]], ptr addrspace(1) [[OUT]], align 16 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define 
{{[^@]+}}@ptr1_fp128_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], fp128 inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store fp128 [[IN]], ptr addrspace(1) [[OUT]], align 16 +; PRELOAD-ALL-NEXT: ret void +; + store fp128 %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_v7i8_kernel(ptr addrspace(1) %out, <7 x i8> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v7i8_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <7 x i8> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store <7 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 8 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v7i8_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <7 x i8> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store <7 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v7i8_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <7 x i8> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store <7 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-ALL-NEXT: ret void +; + store <7 x i8> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_v7half_kernel(ptr addrspace(1) %out, <7 x half> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v7half_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <7 x half> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store <7 x half> [[IN]], ptr addrspace(1) [[OUT]], align 16 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v7half_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <7 x half> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store <7 x half> [[IN]], ptr addrspace(1) [[OUT]], align 16 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v7half_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <7 x half> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store <7 
x half> [[IN]], ptr addrspace(1) [[OUT]], align 16 +; PRELOAD-ALL-NEXT: ret void +; + store <7 x half> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_i16_i32_ptr1_kernel(ptr addrspace(1) %out, i16 %in, i32 %in2, ptr addrspace(1) %out2) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_i32_ptr1_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[IN:%.*]], i32 [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2 +; NO-PRELOAD-NEXT: store i32 [[IN2]], ptr addrspace(1) [[OUT2]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_i32_ptr1_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], i32 [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-2-NEXT: store i32 [[IN2]], ptr addrspace(1) [[OUT2]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_i32_ptr1_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], i32 inreg [[IN2:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-ALL-NEXT: store i32 [[IN2]], ptr addrspace(1) [[OUT2]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + store i16 %in, ptr addrspace(1) %out + store i32 %in2, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @ptr1_i16_v3i32_ptr1_kernel(ptr addrspace(1) %out, i16 %in, <3 x i32> %in2, ptr addrspace(1) %out2) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_v3i32_ptr1_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[IN:%.*]], <3 x i32> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2 +; NO-PRELOAD-NEXT: store <3 x i32> [[IN2]], ptr addrspace(1) [[OUT2]], align 16 +; 
NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_v3i32_ptr1_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], <3 x i32> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-2-NEXT: store <3 x i32> [[IN2]], ptr addrspace(1) [[OUT2]], align 16 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_v3i32_ptr1_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], <3 x i32> inreg [[IN2:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-ALL-NEXT: store <3 x i32> [[IN2]], ptr addrspace(1) [[OUT2]], align 16 +; PRELOAD-ALL-NEXT: ret void +; + store i16 %in, ptr addrspace(1) %out + store <3 x i32> %in2, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @ptr1_i16_i16_ptr1_kernel(ptr addrspace(1) %out, i16 %in, i16 %in2, ptr addrspace(1) %out2) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_i16_ptr1_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[IN:%.*]], i16 [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2 +; NO-PRELOAD-NEXT: store i16 [[IN2]], ptr addrspace(1) [[OUT2]], align 2 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_i16_ptr1_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], i16 [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-2-NEXT: store i16 [[IN2]], ptr addrspace(1) [[OUT2]], align 2 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_i16_ptr1_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], i16 inreg [[IN2:%.*]], ptr addrspace(1) inreg 
[[OUT2:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-ALL-NEXT: store i16 [[IN2]], ptr addrspace(1) [[OUT2]], align 2 +; PRELOAD-ALL-NEXT: ret void +; + store i16 %in, ptr addrspace(1) %out + store i16 %in2, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @ptr1_i16_v2i8_ptr1_kernel(ptr addrspace(1) %out, i16 %in, <2 x i8> %in2, ptr addrspace(1) %out2) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_v2i8_ptr1_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[IN:%.*]], <2 x i8> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2 +; NO-PRELOAD-NEXT: store <2 x i8> [[IN2]], ptr addrspace(1) [[OUT2]], align 2 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_v2i8_ptr1_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], <2 x i8> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-2-NEXT: store <2 x i8> [[IN2]], ptr addrspace(1) [[OUT2]], align 2 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_v2i8_ptr1_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], <2 x i8> inreg [[IN2:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-ALL-NEXT: store <2 x i8> [[IN2]], ptr addrspace(1) [[OUT2]], align 2 +; PRELOAD-ALL-NEXT: ret void +; + store i16 %in, ptr addrspace(1) %out + store <2 x i8> %in2, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @i32_ptr1_i32_staggered_kernel(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@i32_ptr1_i32_staggered_kernel +; NO-PRELOAD-SAME: (i32 [[ARG0:%.*]], ptr addrspace(1) [[OUT:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] { +; 
NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[ARG0]], [[ARG1]] +; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@i32_ptr1_i32_staggered_kernel +; PRELOAD-2-SAME: (i32 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[ADD:%.*]] = add i32 [[ARG0]], [[ARG1]] +; PRELOAD-2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@i32_ptr1_i32_staggered_kernel +; PRELOAD-ALL-SAME: (i32 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[ARG1:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[ADD:%.*]] = add i32 [[ARG0]], [[ARG1]] +; PRELOAD-ALL-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + %add = add i32 %arg0, %arg1 + store i32 %add, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_i8_i32_trailing_unused_kernel(ptr addrspace(1) %out, i8 %arg0, i32 %unused) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i8_i32_trailing_unused_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i8 [[ARG0:%.*]], i32 [[UNUSED:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32 +; NO-PRELOAD-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i8_i32_trailing_unused_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg [[ARG0:%.*]], i32 [[UNUSED:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32 +; PRELOAD-2-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i8_i32_trailing_unused_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg [[ARG0:%.*]], i32 inreg [[UNUSED:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32 +; 
PRELOAD-ALL-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + %ext = zext i8 %arg0 to i32 + store i32 %ext, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll deleted file mode 100644 index 20edbd6c0d0fa..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll +++ /dev/null @@ -1,263 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=NO-PRELOAD %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-1 %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=3 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-3 %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=16 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-16 %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=20 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-20 %s - -define amdgpu_kernel void @test_preload_hint_kernel_1(ptr %0) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1 -; NO-PRELOAD-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1 -; PRELOAD-1-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1 -; PRELOAD-3-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1 -; PRELOAD-16-SAME: (ptr inreg [[TMP0:%.*]]) 
#[[ATTR0:[0-9]+]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1 -; PRELOAD-20-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - -define amdgpu_kernel void @test_preload_hint_kernel_2(i32 %0, i64 %1) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2 -; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2 -; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2 -; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2 -; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2 -; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - -define amdgpu_kernel void @test_preload_hint_kernel_4(i32 %0, i64 %1, <2 x float> %2, ptr %3) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4 -; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4 -; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4 -; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define 
{{[^@]+}}@test_preload_hint_kernel_4 -; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]]) #[[ATTR0]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4 -; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]]) #[[ATTR0]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - -define amdgpu_kernel void @test_preload_hint_kernel_18(i32 %0, i64 %1, <2 x float> %2, ptr %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, i32 %14, i32 %15, i32 %16, i32 %17) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18 -; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]], i32 [[TMP4:%.*]], i32 [[TMP5:%.*]], i32 [[TMP6:%.*]], i32 [[TMP7:%.*]], i32 [[TMP8:%.*]], i32 [[TMP9:%.*]], i32 [[TMP10:%.*]], i32 [[TMP11:%.*]], i32 [[TMP12:%.*]], i32 [[TMP13:%.*]], i32 [[TMP14:%.*]], i32 [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18 -; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]], i32 [[TMP4:%.*]], i32 [[TMP5:%.*]], i32 [[TMP6:%.*]], i32 [[TMP7:%.*]], i32 [[TMP8:%.*]], i32 [[TMP9:%.*]], i32 [[TMP10:%.*]], i32 [[TMP11:%.*]], i32 [[TMP12:%.*]], i32 [[TMP13:%.*]], i32 [[TMP14:%.*]], i32 [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18 -; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr [[TMP3:%.*]], i32 [[TMP4:%.*]], i32 [[TMP5:%.*]], i32 [[TMP6:%.*]], i32 [[TMP7:%.*]], i32 [[TMP8:%.*]], i32 [[TMP9:%.*]], i32 [[TMP10:%.*]], i32 [[TMP11:%.*]], i32 [[TMP12:%.*]], i32 [[TMP13:%.*]], i32 
[[TMP14:%.*]], i32 [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18 -; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], i32 inreg [[TMP6:%.*]], i32 inreg [[TMP7:%.*]], i32 inreg [[TMP8:%.*]], i32 inreg [[TMP9:%.*]], i32 inreg [[TMP10:%.*]], i32 inreg [[TMP11:%.*]], i32 inreg [[TMP12:%.*]], i32 inreg [[TMP13:%.*]], i32 inreg [[TMP14:%.*]], i32 inreg [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18 -; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], i32 inreg [[TMP6:%.*]], i32 inreg [[TMP7:%.*]], i32 inreg [[TMP8:%.*]], i32 inreg [[TMP9:%.*]], i32 inreg [[TMP10:%.*]], i32 inreg [[TMP11:%.*]], i32 inreg [[TMP12:%.*]], i32 inreg [[TMP13:%.*]], i32 inreg [[TMP14:%.*]], i32 inreg [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - -define void @test_preload_hint_non_kernel_2(i32 %0, i64 %1) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2 -; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2 -; PRELOAD-1-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2 -; PRELOAD-3-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2 -; PRELOAD-16-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) 
#[[ATTR1:[0-9]+]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2 -; PRELOAD-20-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - -define amdgpu_kernel void @test_preload_hint_kernel_1_call_func(ptr %0) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func -; NO-PRELOAD-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { -; NO-PRELOAD-NEXT: call void @func(ptr [[TMP0]]) -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func -; PRELOAD-1-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { -; PRELOAD-1-NEXT: call void @func(ptr [[TMP0]]) -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func -; PRELOAD-3-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { -; PRELOAD-3-NEXT: call void @func(ptr [[TMP0]]) -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func -; PRELOAD-16-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { -; PRELOAD-16-NEXT: call void @func(ptr [[TMP0]]) -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func -; PRELOAD-20-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { -; PRELOAD-20-NEXT: call void @func(ptr [[TMP0]]) -; PRELOAD-20-NEXT: ret void -; - call void @func(ptr %0) - ret void -} - -define amdgpu_kernel void @test_preload_hint_kernel_1_call_intrinsic(i16 %0) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; NO-PRELOAD-SAME: (i16 [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { -; NO-PRELOAD-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; PRELOAD-1-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { -; PRELOAD-1-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) -; 
PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; PRELOAD-3-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { -; PRELOAD-3-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; PRELOAD-16-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { -; PRELOAD-16-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; PRELOAD-20-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { -; PRELOAD-20-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) -; PRELOAD-20-NEXT: ret void -; - call void @llvm.amdgcn.set.prio(i16 %0) - ret void -} - -define spir_kernel void @test_preload_hint_kernel_1_spir_cc(ptr %0) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc -; NO-PRELOAD-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc -; PRELOAD-1-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc -; PRELOAD-3-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc -; PRELOAD-16-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc -; PRELOAD-20-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - -define amdgpu_kernel void @test_preload_hint_kernel_2_preexisting(i32 inreg %0, i64 %1) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting -; NO-PRELOAD-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define 
{{[^@]+}}@test_preload_hint_kernel_2_preexisting -; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting -; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting -; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting -; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - -define amdgpu_kernel void @test_preload_hint_kernel_incompatible_attributes(ptr addrspace(4) byref(i32) %0, ptr nest %1) { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; NO-PRELOAD-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; PRELOAD-1-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; PRELOAD-3-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; PRELOAD-16-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; PRELOAD-20-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - 
-declare void @func(ptr) #0 -declare void @llvm.amdgcn.set.prio(i16) - -attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll index 0f60888bcb2f5..20858bc603b99 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll @@ -566,13 +566,10 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x ; GFX940-NEXT: .p2align 8 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: .LBB14_0: -; GFX940-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60 -; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 +; GFX940-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x40 ; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[12:13] ; GFX940-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1 ; GFX940-NEXT: v_mov_b32_e32 v1, s9 ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: v_mov_b32_e32 v3, s11 @@ -583,6 +580,10 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x ; GFX940-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-NEXT: v_mov_b32_e32 v3, s7 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, s12 +; GFX940-NEXT: v_mov_b32_e32 v1, s13 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3] offset:32 sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: v5f64_arg: @@ -593,13 +594,10 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB14_0: -; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 -; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; GFX90a-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 ; GFX90a-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90a-NEXT: v_mov_b32_e32 v0, 
s12 -; GFX90a-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 ; GFX90a-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-NEXT: v_mov_b32_e32 v3, s15 @@ -610,6 +608,10 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x ; GFX90a-NEXT: v_mov_b32_e32 v2, s10 ; GFX90a-NEXT: v_mov_b32_e32 v3, s11 ; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-NEXT: s_nop 0 +; GFX90a-NEXT: v_mov_b32_e32 v0, s16 +; GFX90a-NEXT: v_mov_b32_e32 v1, s17 +; GFX90a-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] offset:32 ; GFX90a-NEXT: s_endpgm store <5 x double> %in, ptr addrspace(1) %out, align 8 ret void @@ -941,17 +943,15 @@ define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inr ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB23_0: -; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 ; GFX90a-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-NEXT: global_store_short v3, v0, s[6:7] ; GFX90a-NEXT: v_mov_b32_e32 v0, s13 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: global_store_short v3, v0, s[0:1] offset:12 +; GFX90a-NEXT: global_store_short v3, v0, s[14:15] offset:12 ; GFX90a-NEXT: v_mov_b32_e32 v2, s12 ; GFX90a-NEXT: v_mov_b32_e32 v0, s10 ; GFX90a-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[14:15] ; GFX90a-NEXT: s_endpgm store half %in, ptr addrspace(1) %out store <7 x bfloat> %in2, ptr addrspace(1) %out2 @@ -1191,15 +1191,13 @@ define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg % ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB29_0: -; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 ; GFX90a-NEXT: v_mov_b32_e32 v4, s8 ; GFX90a-NEXT: v_mov_b32_e32 v0, s10 ; GFX90a-NEXT: v_mov_b32_e32 v1, s11 ; GFX90a-NEXT: v_mov_b32_e32 v2, s12 ; GFX90a-NEXT: global_store_short v3, 
v4, s[6:7] -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[14:15] ; GFX90a-NEXT: s_endpgm store i16 %in, ptr addrspace(1) %out store <3 x i32> %in2, ptr addrspace(1) %out2 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 08cc2e4ec7d79..6288a80446cf0 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -301,7 +301,7 @@ define hidden i32 @called(i32 %a) noinline { ret i32 %sub } -define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { +define amdgpu_kernel void @call(ptr addrspace(8) %tmp14, i32 %arg) { ; GFX9-O0-LABEL: call: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_mov_b32 s32, 0 @@ -533,7 +533,7 @@ define i64 @called_i64(i64 %a) noinline { ret i64 %sub } -define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) { +define amdgpu_kernel void @call_i64(ptr addrspace(8) %tmp14, i64 %arg) { ; GFX9-O0-LABEL: call_i64: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_mov_b32 s32, 0 @@ -1153,7 +1153,7 @@ define hidden i32 @strict_wwm_called(i32 %a) noinline { ret i32 %sub } -define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { +define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) %tmp14, i32 %arg) { ; GFX9-O0-LABEL: strict_wwm_call: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_mov_b32 s32, 0 @@ -1385,7 +1385,7 @@ define i64 @strict_wwm_called_i64(i64 %a) noinline { ret i64 %sub } -define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) { +define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) %tmp14, i64 %arg) { ; GFX9-O0-LABEL: strict_wwm_call_i64: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_mov_b32 s32, 0