diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 95d0ad0f9dc96..8d6926a494ec7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -13,6 +13,7 @@
 #include "llvm/CodeGen/MachinePassManager.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Pass.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
 #include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/CodeGen.h"
 
@@ -111,9 +112,9 @@ ModulePass *createAMDGPUCtorDtorLoweringLegacyPass();
 void initializeAMDGPUCtorDtorLoweringLegacyPass(PassRegistry &);
 extern char &AMDGPUCtorDtorLoweringLegacyPassID;
 
-FunctionPass *createAMDGPULowerKernelArgumentsPass();
-void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
-extern char &AMDGPULowerKernelArgumentsID;
+CallGraphSCCPass *createAMDGPULowerKernelArgumentsLegacyPass(const TargetMachine *TM);
+void initializeAMDGPULowerKernelArgumentsLegacyPass(PassRegistry &);
+extern char &AMDGPULowerKernelArgumentsLegacyPassID;
 
 FunctionPass *createAMDGPUPromoteKernelArgumentsPass();
 void initializeAMDGPUPromoteKernelArgumentsPass(PassRegistry &);
@@ -310,7 +311,7 @@ class AMDGPULowerKernelArgumentsPass
 
 public:
   AMDGPULowerKernelArgumentsPass(TargetMachine &TM) : TM(TM){};
-  PreservedAnalyses run(Function &, FunctionAnalysisManager &);
+  PreservedAnalyses run(Module &, ModuleAnalysisManager &);
 };
 
 struct AMDGPUAttributorOptions {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index 6573176492b7f..9fcb90fa0ce8b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -14,6 +14,7 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IRBuilder.h"
@@ -131,7 +132,6 @@ class PreloadKernelArgInfo {
     NF->setAttributes(AL);
     F.replaceAllUsesWith(NF);
-    F.setCallingConv(CallingConv::C);
 
     return NF;
   }
 
@@ -169,8 +169,9 @@ class PreloadKernelArgInfo {
   }
 
   // Try to allocate SGPRs to preload implicit kernel arguments.
-  void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
-                                       IRBuilder<> &Builder) {
+  void tryAllocImplicitArgPreloadSGPRs(
+      uint64_t ImplicitArgsBaseOffset, IRBuilder<> &Builder,
+      SmallVectorImpl<Function *> &FunctionsToErase) {
     Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
         F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
     if (!ImplicitArgPtr)
@@ -239,6 +240,7 @@ class PreloadKernelArgInfo {
     unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
     Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
     assert(NF);
+    FunctionsToErase.push_back(&F);
     for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
       LoadInst *LoadInst = I->first;
       unsigned LoadOffset = I->second;
@@ -250,264 +252,313 @@ class PreloadKernelArgInfo {
   }
 };
 
-class AMDGPULowerKernelArguments : public FunctionPass {
+class AMDGPULowerKernelArguments {
+private:
+  SmallVector<Function *> FunctionsToErase;
+
 public:
-  static char ID;
+  AMDGPULowerKernelArguments() {}
 
-  AMDGPULowerKernelArguments() : FunctionPass(ID) {}
+  bool eraseTaggedFunctions() {
+    if (FunctionsToErase.empty())
+      return false;
 
-  bool runOnFunction(Function &F) override;
+    for (Function *F : FunctionsToErase)
+      F->eraseFromParent();
 
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<TargetPassConfig>();
-    AU.setPreservesAll();
-  }
-};
+    return true;
+  }
 
-} // end anonymous namespace
+  // skip allocas
+  static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
+    BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
+    for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
+      AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);
 
-// skip allocas
-static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
-  BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
-  for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
-    AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);
+      // If this is a dynamic alloca, the value may depend on the loaded kernargs,
+      // so loads will need to be inserted before it.
+      if (!AI || !AI->isStaticAlloca())
+        break;
+    }
 
-    // If this is a dynamic alloca, the value may depend on the loaded kernargs,
-    // so loads will need to be inserted before it.
-    if (!AI || !AI->isStaticAlloca())
-      break;
+    return InsPt;
   }
 
-  return InsPt;
-}
+  bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
+    CallingConv::ID CC = F.getCallingConv();
+    if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
+      return false;
 
-static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
-  CallingConv::ID CC = F.getCallingConv();
-  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
-    return false;
-
-  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
-  LLVMContext &Ctx = F.getParent()->getContext();
-  const DataLayout &DL = F.getDataLayout();
-  BasicBlock &EntryBlock = *F.begin();
-  IRBuilder<> Builder(&EntryBlock, getInsertPt(EntryBlock));
-
-  const Align KernArgBaseAlign(16); // FIXME: Increase if necessary
-  const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();
-
-  Align MaxAlign;
-  // FIXME: Alignment is broken with explicit arg offset.;
-  const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
-  if (TotalKernArgSize == 0)
-    return false;
-
-  CallInst *KernArgSegment =
-      Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
-                              nullptr, F.getName() + ".kernarg.segment");
-  KernArgSegment->addRetAttr(Attribute::NonNull);
-  KernArgSegment->addRetAttr(
-      Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
-
-  uint64_t ExplicitArgOffset = 0;
-  // Preloaded kernel arguments must be sequential.
-  bool InPreloadSequence = true;
-  PreloadKernelArgInfo PreloadInfo(F, ST);
-
-  for (Argument &Arg : F.args()) {
-    const bool IsByRef = Arg.hasByRefAttr();
-    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
-    MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
-    Align ABITypeAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);
-
-    uint64_t Size = DL.getTypeSizeInBits(ArgTy);
-    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
-
-    uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
-    uint64_t LastExplicitArgOffset = ExplicitArgOffset;
-    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
-
-    // Guard against the situation where hidden arguments have already been
-    // lowered and added to the kernel function signiture, i.e. in a situation
-    // where this pass has run twice.
-    if (Arg.hasAttribute("amdgpu-hidden-argument"))
-      break;
-
-    // Try to preload this argument into user SGPRs.
-    if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() &&
-        !Arg.getType()->isAggregateType())
-      if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset,
-                                           LastExplicitArgOffset))
-        continue;
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+    LLVMContext &Ctx = F.getParent()->getContext();
+    const DataLayout &DL = F.getDataLayout();
+    BasicBlock &EntryBlock = *F.begin();
+    IRBuilder<> Builder(&EntryBlock, getInsertPt(EntryBlock));
 
-    InPreloadSequence = false;
+    const Align KernArgBaseAlign(16); // FIXME: Increase if necessary
+    const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();
 
-    if (Arg.use_empty())
-      continue;
+    Align MaxAlign;
+    // FIXME: Alignment is broken with explicit arg offset.;
+    const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
+    if (TotalKernArgSize == 0)
+      return false;
 
-    // If this is byval, the loads are already explicit in the function. We just
-    // need to rewrite the pointer values.
-    if (IsByRef) {
-      Value *ArgOffsetPtr = Builder.CreateConstInBoundsGEP1_64(
-          Builder.getInt8Ty(), KernArgSegment, EltOffset,
-          Arg.getName() + ".byval.kernarg.offset");
+    CallInst *KernArgSegment =
+        Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
+                                nullptr, F.getName() + ".kernarg.segment");
+    KernArgSegment->addRetAttr(Attribute::NonNull);
+    KernArgSegment->addRetAttr(
+        Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
 
-      Value *CastOffsetPtr =
-          Builder.CreateAddrSpaceCast(ArgOffsetPtr, Arg.getType());
-      Arg.replaceAllUsesWith(CastOffsetPtr);
-      continue;
-    }
+    uint64_t ExplicitArgOffset = 0;
+    // Preloaded kernel arguments must be sequential.
+    bool InPreloadSequence = true;
+    PreloadKernelArgInfo PreloadInfo(F, ST);
 
-    if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
-      // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
-      // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
-      // can't represent this with range metadata because it's only allowed for
-      // integer types.
-      if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
-           PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
-          !ST.hasUsableDSOffset())
-        continue;
+    for (Argument &Arg : F.args()) {
+      const bool IsByRef = Arg.hasByRefAttr();
+      Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
+      MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
+      Align ABITypeAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);
+
+      uint64_t Size = DL.getTypeSizeInBits(ArgTy);
+      uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
+
+      uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
+      uint64_t LastExplicitArgOffset = ExplicitArgOffset;
+      ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
+
+      // Guard against the situation where hidden arguments have already been
+      // lowered and added to the kernel function signiture, i.e. in a situation
+      // where this pass has run twice.
+      if (Arg.hasAttribute("amdgpu-hidden-argument"))
+        break;
+
+      // Try to preload this argument into user SGPRs.
+      if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() &&
+          !Arg.getType()->isAggregateType())
+        if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset,
+                                             LastExplicitArgOffset))
+          continue;
+
+      InPreloadSequence = false;
 
-      // FIXME: We can replace this with equivalent alias.scope/noalias
-      // metadata, but this appears to be a lot of work.
-      if (Arg.hasNoAliasAttr())
+      if (Arg.use_empty())
         continue;
-    }
 
-    auto *VT = dyn_cast<FixedVectorType>(ArgTy);
-    bool IsV3 = VT && VT->getNumElements() == 3;
-    bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();
-
-    VectorType *V4Ty = nullptr;
-
-    int64_t AlignDownOffset = alignDown(EltOffset, 4);
-    int64_t OffsetDiff = EltOffset - AlignDownOffset;
-    Align AdjustedAlign = commonAlignment(
-        KernArgBaseAlign, DoShiftOpt ? AlignDownOffset : EltOffset);
-
-    Value *ArgPtr;
-    Type *AdjustedArgTy;
-    if (DoShiftOpt) { // FIXME: Handle aggregate types
-      // Since we don't have sub-dword scalar loads, avoid doing an extload by
-      // loading earlier than the argument address, and extracting the relevant
-      // bits.
-      // TODO: Update this for GFX12 which does have scalar sub-dword loads.
-      //
-      // Additionally widen any sub-dword load to i32 even if suitably aligned,
-      // so that CSE between different argument loads works easily.
-      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
-          Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
-          Arg.getName() + ".kernarg.offset.align.down");
-      AdjustedArgTy = Builder.getInt32Ty();
-    } else {
-      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
-          Builder.getInt8Ty(), KernArgSegment, EltOffset,
-          Arg.getName() + ".kernarg.offset");
-      AdjustedArgTy = ArgTy;
-    }
+      // If this is byval, the loads are already explicit in the function. We just
+      // need to rewrite the pointer values.
+      if (IsByRef) {
+        Value *ArgOffsetPtr = Builder.CreateConstInBoundsGEP1_64(
+            Builder.getInt8Ty(), KernArgSegment, EltOffset,
+            Arg.getName() + ".byval.kernarg.offset");
 
-    if (IsV3 && Size >= 32) {
-      V4Ty = FixedVectorType::get(VT->getElementType(), 4);
-      // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
-      AdjustedArgTy = V4Ty;
-    }
+        Value *CastOffsetPtr =
+            Builder.CreateAddrSpaceCast(ArgOffsetPtr, Arg.getType());
+        Arg.replaceAllUsesWith(CastOffsetPtr);
+        continue;
+      }
 
-    LoadInst *Load =
-        Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
-    Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
+      if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
+        // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
+        // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
+        // can't represent this with range metadata because it's only allowed for
+        // integer types.
+        if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+             PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
+            !ST.hasUsableDSOffset())
+          continue;
 
-    MDBuilder MDB(Ctx);
+        // FIXME: We can replace this with equivalent alias.scope/noalias
+        // metadata, but this appears to be a lot of work.
+        if (Arg.hasNoAliasAttr())
+          continue;
+      }
 
-    if (isa<PointerType>(ArgTy)) {
-      if (Arg.hasNonNullAttr())
-        Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));
+      auto *VT = dyn_cast<FixedVectorType>(ArgTy);
+      bool IsV3 = VT && VT->getNumElements() == 3;
+      bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();
+
+      VectorType *V4Ty = nullptr;
+
+      int64_t AlignDownOffset = alignDown(EltOffset, 4);
+      int64_t OffsetDiff = EltOffset - AlignDownOffset;
+      Align AdjustedAlign = commonAlignment(
+          KernArgBaseAlign, DoShiftOpt ? AlignDownOffset : EltOffset);
+
+      Value *ArgPtr;
+      Type *AdjustedArgTy;
+      if (DoShiftOpt) { // FIXME: Handle aggregate types
+        // Since we don't have sub-dword scalar loads, avoid doing an extload by
+        // loading earlier than the argument address, and extracting the relevant
+        // bits.
+        // TODO: Update this for GFX12 which does have scalar sub-dword loads.
+        //
+        // Additionally widen any sub-dword load to i32 even if suitably aligned,
+        // so that CSE between different argument loads works easily.
+        ArgPtr = Builder.CreateConstInBoundsGEP1_64(
+            Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
+            Arg.getName() + ".kernarg.offset.align.down");
+        AdjustedArgTy = Builder.getInt32Ty();
+      } else {
+        ArgPtr = Builder.CreateConstInBoundsGEP1_64(
+            Builder.getInt8Ty(), KernArgSegment, EltOffset,
+            Arg.getName() + ".kernarg.offset");
+        AdjustedArgTy = ArgTy;
+      }
 
-      uint64_t DerefBytes = Arg.getDereferenceableBytes();
-      if (DerefBytes != 0) {
-        Load->setMetadata(
-            LLVMContext::MD_dereferenceable,
-            MDNode::get(Ctx,
-                        MDB.createConstant(
-                            ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
-      }
+      if (IsV3 && Size >= 32) {
+        V4Ty = FixedVectorType::get(VT->getElementType(), 4);
+        // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
+        AdjustedArgTy = V4Ty;
+      }
 
-      uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
-      if (DerefOrNullBytes != 0) {
-        Load->setMetadata(
-            LLVMContext::MD_dereferenceable_or_null,
-            MDNode::get(Ctx,
-                        MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
-                                                            DerefOrNullBytes))));
-      }
+      LoadInst *Load =
+          Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
+      Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
 
-      if (MaybeAlign ParamAlign = Arg.getParamAlign()) {
-        Load->setMetadata(
-            LLVMContext::MD_align,
-            MDNode::get(Ctx, MDB.createConstant(ConstantInt::get(
-                                 Builder.getInt64Ty(), ParamAlign->value()))));
-      }
-    }
+      MDBuilder MDB(Ctx);
 
-    // TODO: Convert noalias arg to !noalias
+      if (isa<PointerType>(ArgTy)) {
+        if (Arg.hasNonNullAttr())
+          Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));
+
+        uint64_t DerefBytes = Arg.getDereferenceableBytes();
+        if (DerefBytes != 0) {
+          Load->setMetadata(
+              LLVMContext::MD_dereferenceable,
+              MDNode::get(Ctx,
+                          MDB.createConstant(
+                              ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
+        }
+
+        uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
+        if (DerefOrNullBytes != 0) {
+          Load->setMetadata(
+              LLVMContext::MD_dereferenceable_or_null,
+              MDNode::get(Ctx,
+                          MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
+                                                              DerefOrNullBytes))));
+        }
+
+        if (MaybeAlign ParamAlign = Arg.getParamAlign()) {
+          Load->setMetadata(
+              LLVMContext::MD_align,
+              MDNode::get(Ctx, MDB.createConstant(ConstantInt::get(
+                                   Builder.getInt64Ty(), ParamAlign->value()))));
+        }
+      }
+
+      // TODO: Convert noalias arg to !noalias
+
+      if (DoShiftOpt) {
+        Value *ExtractBits = OffsetDiff == 0 ?
+          Load : Builder.CreateLShr(Load, OffsetDiff * 8);
+
+        IntegerType *ArgIntTy = Builder.getIntNTy(Size);
+        Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
+        Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
+                                              Arg.getName() + ".load");
+        Arg.replaceAllUsesWith(NewVal);
+      } else if (IsV3) {
+        Value *Shuf = Builder.CreateShuffleVector(Load, ArrayRef<int>{0, 1, 2},
+                                                  Arg.getName() + ".load");
+        Arg.replaceAllUsesWith(Shuf);
+      } else {
+        Load->setName(Arg.getName() + ".load");
+        Arg.replaceAllUsesWith(Load);
+      }
+    }
 
-    if (DoShiftOpt) {
-      Value *ExtractBits = OffsetDiff == 0 ?
-        Load : Builder.CreateLShr(Load, OffsetDiff * 8);
-
-      IntegerType *ArgIntTy = Builder.getIntNTy(Size);
-      Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
-      Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
-                                            Arg.getName() + ".load");
-      Arg.replaceAllUsesWith(NewVal);
-    } else if (IsV3) {
-      Value *Shuf = Builder.CreateShuffleVector(Load, ArrayRef<int>{0, 1, 2},
-                                                Arg.getName() + ".load");
-      Arg.replaceAllUsesWith(Shuf);
-    } else {
-      Load->setName(Arg.getName() + ".load");
-      Arg.replaceAllUsesWith(Load);
+    KernArgSegment->addRetAttr(
+        Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
+
+    if (InPreloadSequence) {
+      uint64_t ImplicitArgsBaseOffset =
+          alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
+          BaseOffset;
+      PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
+                                                  Builder, FunctionsToErase);
     }
+
+    return true;
   }
 
-  KernArgSegment->addRetAttr(
-      Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
+  bool runOnSCC(CallGraphSCC &SCC, const TargetMachine &TM) {
+    bool Changed = false;
+    for (CallGraphNode *I : SCC) {
+      Function *F = I->getFunction();
+      if (!F || F->isDeclaration())
+        continue;
+
+      Changed |= lowerKernelArguments(*F, TM);
+    }
 
-  if (InPreloadSequence) {
-    uint64_t ImplicitArgsBaseOffset =
-        alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
-        BaseOffset;
-    PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
-                                                Builder);
+    return Changed;
   }
 
-  return true;
-}
+  bool runOnModule(Module &M, const TargetMachine &TM) {
+    bool Changed = false;
 
-bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
-  auto &TPC = getAnalysis<TargetPassConfig>();
-  const TargetMachine &TM = TPC.getTM<TargetMachine>();
-  return lowerKernelArguments(F, TM);
-}
+    for (Function &F : M)
+      Changed |= lowerKernelArguments(F, TM);
+
+    Changed |= eraseTaggedFunctions();
+
+    return Changed;
+  }
+};
+
+class AMDGPULowerKernelArgumentsLegacy : public CallGraphSCCPass {
+private:
+  AMDGPULowerKernelArguments Impl;
+
+public:
+  static char ID;
+  const TargetMachine *TM;
+
+  AMDGPULowerKernelArgumentsLegacy(const TargetMachine *TM = nullptr)
+      : CallGraphSCCPass(ID), TM(TM) {}
+
+  bool runOnSCC(CallGraphSCC &SCC) override {
+    if (!TM) {
+      auto &TPC = getAnalysis<TargetPassConfig>();
+      TM = &TPC.getTM<TargetMachine>();
+    }
+
+    return Impl.runOnSCC(SCC, *TM);
+  }
+
+  bool doFinalization(CallGraph &CG) override {
+    return Impl.eraseTaggedFunctions();
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    if (!TM)
+      AU.addRequired<TargetPassConfig>();
+
+    AU.setPreservesAll();
+  }
+};
 
-INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArgumentsLegacy, DEBUG_TYPE,
                       "AMDGPU Lower Kernel Arguments", false, false)
-INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments",
-                    false, false)
+INITIALIZE_PASS_END(AMDGPULowerKernelArgumentsLegacy, DEBUG_TYPE,
+                    "AMDGPU Lower Kernel Arguments", false, false)
 
-char AMDGPULowerKernelArguments::ID = 0;
+char AMDGPULowerKernelArgumentsLegacy::ID = 0;
 
-FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
-  return new AMDGPULowerKernelArguments();
+CallGraphSCCPass *
+llvm::createAMDGPULowerKernelArgumentsLegacyPass(const TargetMachine *TM) {
+  return new AMDGPULowerKernelArgumentsLegacy(TM);
 }
 
 PreservedAnalyses
-AMDGPULowerKernelArgumentsPass::run(Function &F, FunctionAnalysisManager &AM) {
-  bool Changed = lowerKernelArguments(F, TM);
-  if (Changed) {
-    // TODO: Preserves a lot more.
-    PreservedAnalyses PA;
-    PA.preserveSet<CFGAnalyses>();
-    return PA;
-  }
-
-  return PreservedAnalyses::all();
+AMDGPULowerKernelArgumentsPass::run(Module &M, ModuleAnalysisManager &AM) {
+  return AMDGPULowerKernelArguments().runOnModule(M, TM)
+             ? PreservedAnalyses::none()
+             : PreservedAnalyses::all();
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 174a90f0aa419..e27ccf36ac3a9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -27,6 +27,8 @@ MODULE_PASS("amdgpu-perf-hint",
               *static_cast<const GCNTargetMachine *>(this)))
 MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
+MODULE_PASS("amdgpu-lower-kernel-arguments",
+            AMDGPULowerKernelArgumentsPass(*this))
 #undef MODULE_PASS
 
 #ifndef MODULE_PASS_WITH_PARAMS
@@ -50,8 +52,6 @@ FUNCTION_PASS("amdgpu-image-intrinsic-opt",
 FUNCTION_PASS("amdgpu-late-codegenprepare",
               AMDGPULateCodeGenPreparePass(
                 *static_cast<const GCNTargetMachine *>(this)))
-FUNCTION_PASS("amdgpu-lower-kernel-arguments",
-              AMDGPULowerKernelArgumentsPass(*this))
 FUNCTION_PASS("amdgpu-lower-kernel-attributes",
               AMDGPULowerKernelAttributesPass())
 FUNCTION_PASS("amdgpu-simplifylib", AMDGPUSimplifyLibCallsPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 786baa6820e86..ce673bf8a3e31 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -484,7 +484,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
   initializeAMDGPUArgumentUsageInfoPass(*PR);
   initializeAMDGPUAtomicOptimizerPass(*PR);
-  initializeAMDGPULowerKernelArgumentsPass(*PR);
+  initializeAMDGPULowerKernelArgumentsLegacyPass(*PR);
   initializeAMDGPUPromoteKernelArgumentsPass(*PR);
   initializeAMDGPULowerKernelAttributesPass(*PR);
   initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
@@ -1214,7 +1214,7 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
 
   if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
       EnableLowerKernelArguments)
-    addPass(createAMDGPULowerKernelArgumentsPass());
+    addPass(createAMDGPULowerKernelArgumentsLegacyPass(TM));
 
   if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
     // This lowering has been placed after codegenprepare to take advantage of
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index c0a87cf4ceacf..df872910cbee6 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -50,8 +50,7 @@
 ; GCN-O0-NEXT: CallGraph Construction
 ; GCN-O0-NEXT: Call Graph SCC Pass Manager
 ; GCN-O0-NEXT: AMDGPU Annotate Kernel Features
-; GCN-O0-NEXT: FunctionPass Manager
-; GCN-O0-NEXT: AMDGPU Lower Kernel Arguments
+; GCN-O0-NEXT: AMDGPU Lower Kernel Arguments
 ; GCN-O0-NEXT: Lower buffer fat pointer operations to buffer resources
 ; GCN-O0-NEXT: CallGraph Construction
 ; GCN-O0-NEXT: Call Graph SCC Pass Manager
@@ -230,8 +229,7 @@
 ; GCN-O1-NEXT: CallGraph Construction
 ; GCN-O1-NEXT: Call Graph SCC Pass Manager
 ; GCN-O1-NEXT: AMDGPU Annotate Kernel Features
-; GCN-O1-NEXT: FunctionPass Manager
-; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments
+; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments
 ; GCN-O1-NEXT: Lower buffer fat pointer operations to buffer resources
 ; GCN-O1-NEXT: CallGraph Construction
 ; GCN-O1-NEXT: Call Graph SCC Pass Manager
@@ -524,8 +522,7 @@
 ; GCN-O1-OPTS-NEXT: CallGraph Construction
 ; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager
 ; GCN-O1-OPTS-NEXT: AMDGPU Annotate Kernel Features
-; GCN-O1-OPTS-NEXT: FunctionPass Manager
-; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments
+; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments
 ; GCN-O1-OPTS-NEXT: Lower buffer fat pointer operations to buffer resources
 ; GCN-O1-OPTS-NEXT: CallGraph Construction
 ; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager
@@ -836,8 +833,7 @@
 ; GCN-O2-NEXT: CallGraph Construction
 ; GCN-O2-NEXT: Call Graph SCC Pass Manager
 ; GCN-O2-NEXT: AMDGPU Annotate Kernel Features
-; GCN-O2-NEXT: FunctionPass Manager
-; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments
+; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments
 ; GCN-O2-NEXT: Lower buffer fat pointer operations to buffer resources
 ; GCN-O2-NEXT: CallGraph Construction
 ; GCN-O2-NEXT: Call Graph SCC Pass Manager
@@ -1163,8 +1159,7 @@
 ; GCN-O3-NEXT: CallGraph Construction
 ; GCN-O3-NEXT: Call Graph SCC Pass Manager
 ; GCN-O3-NEXT: AMDGPU Annotate Kernel Features
-; GCN-O3-NEXT: FunctionPass Manager
-; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments
+; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments
 ; GCN-O3-NEXT: Lower buffer fat pointer operations to buffer resources
 ; GCN-O3-NEXT: CallGraph Construction
 ; GCN-O3-NEXT: Call Graph SCC Pass Manager
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll
index aeb7faade4715..ad23b24ce9ff1 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s | FileCheck -check-prefix=PRELOAD %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,amdgpu-lower-kernel-arguments' -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,amdgpu-lower-kernel-arguments' -amdgpu-kernarg-preload-count=16 -S < %s | FileCheck -check-prefix=PRELOAD %s
 
 define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) {
 ; NO-PRELOAD-LABEL: define amdgpu_kernel void @preload_block_count_x(
@@ -27,6 +27,11 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) {
   ret void
 }
 
+; Preloading hidden arguments involves cloning functions to rewrite the kernel
+; signature. Verify that cloned function was deleted.
+
+; PRELOAD-NOT: declare {{.*}}@0
+
 define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i512) {
 ; NO-PRELOAD-LABEL: define amdgpu_kernel void @no_free_sgprs_block_count_x(
 ; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]], i512 [[TMP0:%.*]]) #[[ATTR0]] {
@@ -83,6 +88,8 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) %out) {
   ret void
 }
 
+; PRELOAD-NOT: declare {{.*}}@1
+
 define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) %out) {
 ; NO-PRELOAD-LABEL: define amdgpu_kernel void @preload_workgroup_size_xyz(
 ; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
@@ -141,6 +148,8 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) %out) {
   ret void
 }
 
+; PRELOAD-NOT: declare {{.*}}@2
+
 define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inreg %out) {
 ; NO-PRELOAD-LABEL: define amdgpu_kernel void @incorrect_type_i64_block_count_x(
 ; NO-PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {