diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
index 3033b7f58f1a2..3e5d83b8e3fb1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
@@ -15,10 +15,9 @@
 /// SplitModule: load-balance the module's functions across a set of N
 /// partitions to allow parallel codegen. However, it does it very
 /// differently than the target-agnostic variant:
-/// - Kernels are used as the module's "roots".
-///   They're known entry points on AMDGPU, and everything else is often
-///   internal only.
-/// - Each kernel has a set of dependencies, and when a kernel and its
+/// - The module has "split roots", which are kernels in the vast
+///   majority of cases.
+/// - Each root has a set of dependencies, and when a root and its
 ///   dependencies is considered "big", we try to put it in a partition where
 ///   most dependencies are already imported, to avoid duplicating large
 ///   amounts of code.
@@ -67,20 +66,22 @@ using namespace llvm;
 
 namespace {
 
-static cl::opt<float> LargeKernelFactor(
-    "amdgpu-module-splitting-large-kernel-threshold", cl::init(2.0f),
+static cl::opt<float> LargeFnFactor(
+    "amdgpu-module-splitting-large-function-threshold", cl::init(2.0f),
     cl::Hidden,
     cl::desc(
-        "consider a kernel as large and needing special treatment when it "
+        "consider a function as large and needing special treatment when the "
+        "cost of importing it into a partition "
         "exceeds the average cost of a partition by this factor; e;g. 2.0 "
-        "means if the kernel and its dependencies is 2 times bigger than "
-        "an average partition; 0 disables large kernels handling entirely"));
+        "means if the function and its dependencies are 2 times bigger than "
+        "an average partition; 0 disables large function handling entirely"));
 
-static cl::opt<float> LargeKernelOverlapForMerge(
-    "amdgpu-module-splitting-large-kernel-merge-overlap", cl::init(0.8f),
+static cl::opt<float> LargeFnOverlapForMerge(
+    "amdgpu-module-splitting-large-function-merge-overlap", cl::init(0.8f),
     cl::Hidden,
-    cl::desc("defines how much overlap between two large kernel's dependencies "
-             "is needed to put them in the same partition"));
+    cl::desc(
+        "defines how much overlap between two large functions' dependencies "
+        "is needed to put them in the same partition"));
 
 static cl::opt<bool> NoExternalizeGlobals(
     "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
@@ -276,9 +277,9 @@ static bool canBeIndirectlyCalled(const Function &F) {
                               /*IgnoreCastedDirectCall=*/true);
 }
 
-/// When a kernel or any of its callees performs an indirect call, this function
+/// When a function or any of its callees performs an indirect call, this
 /// takes over \ref addAllDependencies and adds all potentially callable
-/// functions to \p Fns so they can be counted as dependencies of the kernel.
+/// functions to \p Fns so they can be counted as dependencies of the function.
 ///
 /// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the
 /// presence of an indirect call, the function's resource usage is the same as
@@ -300,13 +301,14 @@ static void addAllIndirectCallDependencies(const Module &M,
 /// \param CG Call graph for \p Fn's module.
 /// \param Fn Current function to look at.
 /// \param Fns[out] Resulting list of functions.
+/// \param OnlyDirect Whether to only consider direct callees.
 /// \param HadIndirectCall[out] Set to true if an indirect call was seen at some
 /// point, either in \p Fn or in one of the function it calls. When that
 /// happens, we fall back to adding all callable functions inside \p Fn's module
 /// to \p Fns.
 static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
                                const Function &Fn,
-                               DenseSet<const Function *> &Fns,
+                               DenseSet<const Function *> &Fns, bool OnlyDirect,
                                bool &HadIndirectCall) {
   assert(!Fn.isDeclaration());
 
@@ -324,6 +326,9 @@ static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
     auto *CGNode = CGEntry.second;
     auto *Callee = CGNode->getFunction();
     if (!Callee) {
+      if (OnlyDirect)
+        continue;
+
       // Functions have an edge towards CallsExternalNode if they're external
       // declarations, or if they do an indirect call. As we only process
       // definitions here, we know this means the function has an indirect
@@ -352,13 +357,19 @@ static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
     }
   }
 
-/// Contains information about a kernel and its dependencies.
-struct KernelWithDependencies {
-  KernelWithDependencies(SplitModuleLogger &SML, CallGraph &CG,
-                         const DenseMap<const Function *, CostType> &FnCosts,
-                         const Function *Fn)
+/// Contains information about a function and its dependencies.
+/// This is a splitting root. The splitting algorithm works by
+/// assigning these to partitions.
+struct FunctionWithDependencies {
+  FunctionWithDependencies(SplitModuleLogger &SML, CallGraph &CG,
+                           const DenseMap<const Function *, CostType> &FnCosts,
+                           const Function *Fn)
       : Fn(Fn) {
-    addAllDependencies(SML, CG, *Fn, Dependencies, HasIndirectCall);
+    // When Fn is not a kernel, we don't need to collect indirect callees:
+    // they are only counted as dependencies for resource usage analysis,
+    // which is only performed on kernels.
+    addAllDependencies(SML, CG, *Fn, Dependencies,
+                       /*OnlyDirect=*/!isEntryPoint(Fn), HasIndirectCall);
     TotalCost = FnCosts.at(Fn);
     for (const auto *Dep : Dependencies) {
       TotalCost += FnCosts.at(Dep);
@@ -379,8 +390,8 @@ struct KernelWithDependencies {
 
   CostType TotalCost = 0;
 
-  /// \returns true if this kernel and its dependencies can be considered large
-  /// according to \p Threshold.
+  /// \returns true if this function and its dependencies can be considered
+  /// large according to \p Threshold.
   bool isLarge(CostType Threshold) const {
     return TotalCost > Threshold && !Dependencies.empty();
   }
@@ -419,40 +430,39 @@ static float calculateOverlap(const DenseSet<const Function *> &A,
 /// \param NumParts Number of partitions to create.
 /// \param ModuleCost Total cost of all functions in \p M.
 /// \param FnCosts Map of Function -> Cost
-/// \param WorkList Kernels and their dependencies to process in order.
+/// \param WorkList Functions and their dependencies to process in order.
 /// \returns The created partitions (a vector of size \p NumParts )
 static std::vector<DenseSet<const Function *>>
 doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
                CostType ModuleCost,
                const DenseMap<const Function *, CostType> &FnCosts,
-               const SmallVector<KernelWithDependencies> &WorkList) {
+               const SmallVector<FunctionWithDependencies> &WorkList) {
 
   SML << "\n--Partitioning Starts--\n";
 
-  // Calculate a "large kernel threshold". When more than one kernel's total
-  // import cost exceeds this value, we will try to merge it with other,
-  // similarly large kernels.
+  // Calculate a "large function threshold". When a function's total import
+  // cost exceeds this value, we will try to assign it to an existing
+  // partition to reduce the amount of duplication needed.
   //
-  // e.g. let two kernels X and Y have a import cost of ~10% of the module, we
+  // e.g. let two functions X and Y have an import cost of ~10% of the module, we
   // assign X to a partition as usual, but when we get to Y, we check if it's
   // worth also putting it in Y's partition.
-  const CostType LargeKernelThreshold =
-      LargeKernelFactor
-          ? CostType(((ModuleCost / NumParts) * LargeKernelFactor))
-          : std::numeric_limits<CostType>::max();
+  const CostType LargeFnThreshold =
+      LargeFnFactor ? CostType(((ModuleCost / NumParts) * LargeFnFactor))
+                    : std::numeric_limits<CostType>::max();
 
   std::vector<DenseSet<const Function *>> Partitions;
   Partitions.resize(NumParts);
 
-  // Assign a partition to each kernel, and try to keep the partitions more or
+  // Assign functions to partitions, and try to keep the partitions more or
   // less balanced. We do that through a priority queue sorted in reverse, so we
   // can always look at the partition with the least content.
   //
   // There are some cases where we will be deliberately unbalanced though.
-  // - Large kernels: we try to merge with existing partitions to reduce code
+  // - Large functions: we try to merge with existing partitions to reduce code
   //   duplication.
-  // - Kernels with indirect or external calls always go in the first partition
-  //   (P0).
+  // - Functions with indirect or external calls always go in the first
+  //   partition (P0).
 
   auto ComparePartitions = [](const std::pair<PartitionID, CostType> &a,
                               const std::pair<PartitionID, CostType> &b) {
     // When two partitions have the same cost, assign to the one with the
@@ -471,17 +481,17 @@ doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
   for (unsigned I = 0; I < NumParts; ++I)
     BalancingQueue.push_back(std::make_pair(I, 0));
 
-  // Helper function to handle assigning a kernel to a partition. This takes
+  // Helper function to handle assigning a function to a partition. This takes
   // care of updating the balancing queue.
   const auto AssignToPartition = [&](PartitionID PID,
-                                     const KernelWithDependencies &KWD) {
+                                     const FunctionWithDependencies &FWD) {
     auto &FnsInPart = Partitions[PID];
-    FnsInPart.insert(KWD.Fn);
-    FnsInPart.insert(KWD.Dependencies.begin(), KWD.Dependencies.end());
+    FnsInPart.insert(FWD.Fn);
+    FnsInPart.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
 
-    SML << "assign " << getName(*KWD.Fn) << " to P" << PID << "\n  -> ";
-    if (!KWD.Dependencies.empty()) {
-      SML << KWD.Dependencies.size() << " dependencies added\n";
+    SML << "assign " << getName(*FWD.Fn) << " to P" << PID << "\n  -> ";
+    if (!FWD.Dependencies.empty()) {
+      SML << FWD.Dependencies.size() << " dependencies added\n";
     };
 
     // Update the balancing queue. we scan backwards because in the common case
@@ -506,44 +516,43 @@ doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
     sort(BalancingQueue, ComparePartitions);
   };
 
-  for (auto &CurKernel : WorkList) {
-    // When a kernel has indirect calls, it must stay in the first partition
+  for (auto &CurFn : WorkList) {
+    // When a function has indirect calls, it must stay in the first partition
     // alongside every reachable non-entry function. This is a nightmare case
     // for splitting as it severely limits what we can do.
-    if (CurKernel.HasIndirectCall) {
-      SML << "Kernel with indirect call(s): " << getName(*CurKernel.Fn)
+    if (CurFn.HasIndirectCall) {
+      SML << "Function with indirect call(s): " << getName(*CurFn.Fn)
          << " defaulting to P0\n";
-      AssignToPartition(0, CurKernel);
+      AssignToPartition(0, CurFn);
       continue;
     }
 
-    // When a kernel has non duplicatable dependencies, we have to keep it in
+    // When a function has non-duplicatable dependencies, we have to keep it in
     // the first partition as well. This is a conservative approach, a
    // finer-grained approach could keep track of which dependencies are
    // non-duplicatable exactly and just make sure they're grouped together.
-    if (CurKernel.HasNonDuplicatableDependecy) {
-      SML << "Kernel with externally visible dependency "
-          << getName(*CurKernel.Fn) << " defaulting to P0\n";
-      AssignToPartition(0, CurKernel);
+    if (CurFn.HasNonDuplicatableDependecy) {
+      SML << "Function with externally visible dependency "
+          << getName(*CurFn.Fn) << " defaulting to P0\n";
+      AssignToPartition(0, CurFn);
       continue;
     }
 
-    // Be smart with large kernels to avoid duplicating their dependencies.
-    if (CurKernel.isLarge(LargeKernelThreshold)) {
-      assert(LargeKernelOverlapForMerge >= 0.0f &&
-             LargeKernelOverlapForMerge <= 1.0f);
-      SML << "Large Kernel: " << getName(*CurKernel.Fn)
+    // Be smart with large functions to avoid duplicating their dependencies.
+    if (CurFn.isLarge(LargeFnThreshold)) {
+      assert(LargeFnOverlapForMerge >= 0.0f && LargeFnOverlapForMerge <= 1.0f);
+      SML << "Large Function: " << getName(*CurFn.Fn)
          << " - looking for partition with at least "
-          << format("%0.2f", LargeKernelOverlapForMerge * 100) << "% overlap\n";
+          << format("%0.2f", LargeFnOverlapForMerge * 100) << "% overlap\n";
 
       bool Assigned = false;
       for (const auto &[PID, Fns] : enumerate(Partitions)) {
-        float Overlap = calculateOverlap(CurKernel.Dependencies, Fns);
+        float Overlap = calculateOverlap(CurFn.Dependencies, Fns);
         SML << "  => " << format("%0.2f", Overlap * 100) << "% overlap with P"
             << PID << '\n';
-        if (Overlap > LargeKernelOverlapForMerge) {
+        if (Overlap > LargeFnOverlapForMerge) {
           SML << "  selecting P" << PID << '\n';
-          AssignToPartition(PID, CurKernel);
+          AssignToPartition(PID, CurFn);
           Assigned = true;
         }
       }
@@ -554,41 +563,34 @@
 
     // Normal "load-balancing", assign to partition with least pressure.
     auto [PID, CurCost] = BalancingQueue.back();
-    AssignToPartition(PID, CurKernel);
+    AssignToPartition(PID, CurFn);
   }
 
-  // Work is mostly done now, verify the partioning and add all functions we may
-  // have missed (= unreachable, or we don't understand how they're reached) to
-  // P0.
-  DenseSet<const Function *> AllFunctions;
-  for (const auto &[Idx, Part] : enumerate(Partitions)) {
-    CostType Cost = 0;
-    for (auto *Fn : Part) {
-      // external linkage functions should exclusively be in the first partition
-      // at this stage. In theory, we should only ever see external linkage
-      // functions here if they're kernels, or if they've been added due to a
-      // kernel using indirect calls somewhere in its CallGraph.
-      assert(Idx == 0 || (!Fn->hasExternalLinkage() || isEntryPoint(Fn)));
-      Cost += FnCosts.at(Fn);
+  if (SML) {
+    for (const auto &[Idx, Part] : enumerate(Partitions)) {
+      CostType Cost = 0;
+      for (auto *Fn : Part)
+        Cost += FnCosts.at(Fn);
+      SML << "P" << Idx << " has a total cost of " << Cost << " ("
+          << format("%0.2f", (float(Cost) / ModuleCost) * 100)
+          << "% of source module)\n";
     }
-    SML << "P" << Idx << " has a total cost of " << Cost << " ("
-        << format("%0.2f", (float(Cost) / ModuleCost) * 100)
-        << "% of source module)\n";
-    AllFunctions.insert(Part.begin(), Part.end());
+
+    SML << "--Partitioning Done--\n\n";
   }
 
-  // Add missed functions to P0. This will take care of adding things like
-  // external functions with no callers in the module to P0. This should be
-  // fairly rare as AMDGPU internalizes everything in most cases, so unused
-  // internal functions would get removed.
+  // Check no functions were missed.
+#ifndef NDEBUG
+  DenseSet<const Function *> AllFunctions;
+  for (const auto &Part : Partitions)
+    AllFunctions.insert(Part.begin(), Part.end());
+
   for (auto &Fn : M) {
     if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) {
-      SML << getName(Fn) << " has no partition assigned, defaulting to P0\n";
-      Partitions[0].insert(&Fn);
+      assert(AllFunctions.contains(&Fn) && "Missed a function?!");
     }
   }
-
-  SML << "--Partitioning Done--\n\n";
+#endif
 
   return Partitions;
 }
@@ -605,6 +607,14 @@ static void externalize(GlobalValue &GV) {
     GV.setName("__llvmsplit_unnamed");
 }
 
+static bool hasDirectCaller(const Function &Fn) {
+  for (auto &U : Fn.uses()) {
+    if (auto *CB = dyn_cast<CallBase>(U.getUser()); CB && CB->isCallee(&U))
+      return true;
+  }
+  return false;
+}
+
 static void splitAMDGPUModule(
     GetTTIFn GetTTI, Module &M, unsigned N,
     function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
@@ -649,13 +659,34 @@ static void splitAMDGPUModule(
   DenseMap<const Function *, CostType> FnCosts;
   const CostType ModuleCost = calculateFunctionCosts(SML, GetTTI, M, FnCosts);
 
-  // Gather every kernel into a WorkList, then sort it by descending total cost
-  // of the kernel so the biggest kernels are seen first.
-  SmallVector<KernelWithDependencies> WorkList;
+  // First, gather every kernel into the worklist.
+  SmallVector<FunctionWithDependencies> WorkList;
   for (auto &Fn : M) {
     if (isEntryPoint(&Fn) && !Fn.isDeclaration())
       WorkList.emplace_back(SML, CG, FnCosts, &Fn);
   }
+
+  // Then, find missing functions that need to be considered as additional
+  // roots. These can't be called in theory, but in practice we still have to
+  // handle them to avoid linker errors.
+  {
+    DenseSet<const Function *> SeenFunctions;
+    for (const auto &FWD : WorkList) {
+      SeenFunctions.insert(FWD.Fn);
+      SeenFunctions.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
+    }
+
+    for (auto &Fn : M) {
+      // If this function is not part of any kernel's dependencies and isn't
+      // directly called, consider it as a root.
+      if (!Fn.isDeclaration() && !isEntryPoint(&Fn) &&
+          !SeenFunctions.count(&Fn) && !hasDirectCaller(Fn)) {
+        WorkList.emplace_back(SML, CG, FnCosts, &Fn);
+      }
+    }
+  }
+
+  // Sort the worklist so the most expensive roots are seen first.
   sort(WorkList, [&](auto &A, auto &B) {
     // Sort by total cost, and if the total cost is identical, sort
     // alphabetically.
@@ -666,13 +697,20 @@ static void splitAMDGPUModule(
 
   if (SML) {
     SML << "Worklist\n";
-    for (const auto &KWD : WorkList) {
-      SML << "[Kernel] " << getName(*KWD.Fn) << " (totalCost:" << KWD.TotalCost
-          << " indirect:" << KWD.HasIndirectCall
-          << " hasNonDuplicatableDep:" << KWD.HasNonDuplicatableDependecy
+    for (const auto &FWD : WorkList) {
+      SML << "[root] " << getName(*FWD.Fn) << " (totalCost:" << FWD.TotalCost
+          << " indirect:" << FWD.HasIndirectCall
+          << " hasNonDuplicatableDep:" << FWD.HasNonDuplicatableDependecy
          << ")\n";
-      for (const auto *Dep : KWD.Dependencies)
-        SML << "  [Dep] " << getName(*Dep) << '\n';
+      // Sort function names before printing to ensure determinism.
+      SmallVector<std::string> SortedDepNames;
+      SortedDepNames.reserve(FWD.Dependencies.size());
+      for (const auto *Dep : FWD.Dependencies)
+        SortedDepNames.push_back(getName(*Dep));
+      sort(SortedDepNames);
+
+      for (const auto &Name : SortedDepNames)
+        SML << "  [dependency] " << Name << '\n';
     }
   }
 
@@ -699,16 +737,8 @@ static void splitAMDGPUModule(
     std::unique_ptr<Module> MPart(
         CloneModule(M, VMap, [&](const GlobalValue *GV) {
           // Functions go in their assigned partition.
-          if (const auto *Fn = dyn_cast<Function>(GV)) {
-// Check we don't import an external linkage function in any
-// partition other than P0.
-#ifndef NDEBUG
-            if (Fn->hasExternalLinkage() && !isEntryPoint(Fn)) {
-              assert((I == 0) == FnsInPart.contains(Fn));
-            }
-#endif
+          if (const auto *Fn = dyn_cast<Function>(GV))
             return FnsInPart.contains(Fn);
-          }
 
           if (NeedsConservativeImport(GV))
             return true;
diff --git a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll
index 8b76237efa325..d269f92763853 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=0
+; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-function-threshold=0
 ; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
 ; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
 ; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
diff --git a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll
index 46d7d9783aeae..731cf4b374c95 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=0
+; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-function-threshold=0
 ; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
 ; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
 
diff --git a/llvm/test/tools/llvm-split/AMDGPU/debug-non-kernel-root.ll b/llvm/test/tools/llvm-split/AMDGPU/debug-non-kernel-root.ll
new file mode 100644
index 0000000000000..836b5c05d0653
--- /dev/null
+++ b/llvm/test/tools/llvm-split/AMDGPU/debug-non-kernel-root.ll
@@ -0,0 +1,36 @@
+; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -debug 2>&1 | FileCheck %s --implicit-check-not="[root]"
+; REQUIRES: asserts
+
+; func_3 is never directly called, so it needs to be considered
+; as a root to handle this module correctly.
+
+; CHECK: [root] kernel_1
+; CHECK-NEXT: [dependency] func_1
+; CHECK-NEXT: [dependency] func_2
+; CHECK-NEXT: [root] func_3
+; CHECK-NEXT: [dependency] func_2
+
+define amdgpu_kernel void @kernel_1() {
+entry:
+  call void @func_1()
+  ret void
+}
+
+define linkonce_odr hidden void @func_1() {
+entry:
+  %call = call i32 @func_2()
+  ret void
+}
+
+define linkonce_odr hidden i32 @func_2() #0 {
+entry:
+  ret i32 0
+}
+
+define void @func_3() {
+entry:
+  %call = call i32 @func_2()
+  ret void
+}
+
+attributes #0 = { noinline optnone }
diff --git a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll
index 4fdbac7d17897..459c5a7f1a2db 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll
@@ -1,9 +1,9 @@
-; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=1.2 -amdgpu-module-splitting-large-kernel-merge-overlap=0.5
+; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-function-threshold=1.2 -amdgpu-module-splitting-large-function-merge-overlap=0.5
 ; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
 ; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
 ; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
 
-; RUN: llvm-split -o %t.nolarge %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=0
+; RUN: llvm-split -o %t.nolarge %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-function-threshold=0
 ; RUN: llvm-dis -o - %t.nolarge0 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK0 %s
 ; RUN: llvm-dis -o - %t.nolarge1 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK1 %s
 ; RUN: llvm-dis -o - %t.nolarge2 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK2 %s
diff --git a/llvm/test/tools/llvm-split/AMDGPU/non-kernels-dependencies.ll b/llvm/test/tools/llvm-split/AMDGPU/non-kernels-dependencies.ll
new file mode 100644
index 0000000000000..f944b7ef65d2d
--- /dev/null
+++ b/llvm/test/tools/llvm-split/AMDGPU/non-kernels-dependencies.ll
@@ -0,0 +1,44 @@
+; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
+
+; 3 functions, each with their own dependencies, should go into
+; 3 distinct partitions.
+
+; CHECK0: define void @C
+; CHECK0: define internal void @HelperC
+
+; CHECK1: define void @B
+; CHECK1: define internal void @HelperB
+
+; CHECK2: define void @A
+; CHECK2: define internal void @HelperA
+
+
+define void @A() {
+  call void @HelperA()
+  ret void
+}
+
+define internal void @HelperA() {
+  ret void
+}
+
+define void @B() {
+  call void @HelperB()
+  ret void
+}
+
+define internal void @HelperB() {
+  ret void
+}
+
+define void @C() {
+  call void @HelperC()
+  ret void
+}
+
+define internal void @HelperC() {
+  ret void
+}
diff --git a/llvm/test/tools/llvm-split/AMDGPU/non-kernels-dependency-indirect.ll b/llvm/test/tools/llvm-split/AMDGPU/non-kernels-dependency-indirect.ll
new file mode 100644
index 0000000000000..167930ce0e806
--- /dev/null
+++ b/llvm/test/tools/llvm-split/AMDGPU/non-kernels-dependency-indirect.ll
@@ -0,0 +1,72 @@
+; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
+
+; We have 4 functions:
+; - Each function has an internal helper.
+; - @A and @B's helpers do an indirect call.
+;
+; For non-kernels, indirect calls shouldn't matter, so
+; @CallCandidate doesn't have to be in A/B's partition, unlike
+; in the corresponding tests for kernels where it has to.
+
+; CHECK0: define hidden void @HelperA
+; CHECK0: define hidden void @HelperB
+; CHECK0: define internal void @HelperC
+; CHECK0: define internal void @HelperD
+; CHECK0: define void @A
+; CHECK0: define void @B
+
+; CHECK1: define internal void @HelperD
+; CHECK1: define void @D
+
+; CHECK2: define hidden void @CallCandidate
+; CHECK2: define internal void @HelperC
+; CHECK2: define void @C
+
+@addrthief = global [3 x ptr] [ptr @HelperA, ptr @HelperB, ptr @CallCandidate]
+
+define internal void @HelperA(ptr %call) {
+  call void %call()
+  ret void
+}
+
+define internal void @HelperB(ptr %call) {
+  call void @HelperC()
+  call void %call()
+  call void @HelperD()
+  ret void
+}
+
+define internal void @CallCandidate() {
+  ret void
+}
+
+define internal void @HelperC() {
+  ret void
+}
+
+define internal void @HelperD() {
+  ret void
+}
+
+define void @A(ptr %call) {
+  call void @HelperA(ptr %call)
+  ret void
+}
+
+define void @B(ptr %call) {
+  call void @HelperB(ptr %call)
+  ret void
+}
+
+define void @C() {
+  call void @HelperC()
+  ret void
+}
+
+define void @D() {
+  call void @HelperD()
+  ret void
+}
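
Reviewer note: the doPartitioning hunks above mostly rename things (kernel -> function/root) and tighten the logging, so the greedy strategy they describe is easy to lose in the diff noise. Below is a minimal standalone sketch of that strategy, not code from this patch: it uses plain STL containers and invented names (Root, Partition, assignRoots) in place of LLVM's ADTs and FunctionWithDependencies, and it omits the P0 special cases for indirect calls and non-duplicatable dependencies.

#include <algorithm>
#include <cstdint>
#include <set>
#include <vector>

using CostType = uint64_t;

// A splitting root (a kernel or, after this patch, an uncalled function),
// with the IDs of everything it would import into a partition.
struct Root {
  std::set<int> Deps;
  CostType TotalCost = 0;
};

struct Partition {
  std::set<int> Fns;
  CostType Cost = 0;
};

// Fraction of A's elements already present in B (cf. calculateOverlap).
static float overlap(const std::set<int> &A, const std::set<int> &B) {
  if (A.empty())
    return 0.0f;
  unsigned Common = 0;
  for (int Id : A)
    Common += B.count(Id);
  return float(Common) / float(A.size());
}

static void assignRoots(std::vector<Root> WorkList,
                        std::vector<Partition> &Parts,
                        CostType LargeThreshold, float MergeOverlap) {
  // Most expensive roots first, as in splitAMDGPUModule's sort.
  std::sort(WorkList.begin(), WorkList.end(),
            [](const Root &A, const Root &B) { return A.TotalCost > B.TotalCost; });

  for (const Root &R : WorkList) {
    Partition *Dst = nullptr;

    // Rule 1: a "large" root prefers a partition that already holds enough
    // of its dependencies, so they are not duplicated into a fresh one.
    if (R.TotalCost > LargeThreshold) {
      for (Partition &P : Parts) {
        if (overlap(R.Deps, P.Fns) > MergeOverlap) {
          Dst = &P;
          break; // Simplification: the patch may merge into several partitions.
        }
      }
    }

    // Rule 2: otherwise plain load balancing; the patch keeps a sorted
    // "balancing queue" instead of re-scanning all partitions.
    if (!Dst)
      Dst = &*std::min_element(Parts.begin(), Parts.end(),
                               [](const Partition &A, const Partition &B) {
                                 return A.Cost < B.Cost;
                               });

    Dst->Fns.insert(R.Deps.begin(), R.Deps.end());
    Dst->Cost += R.TotalCost;
  }
}

Note the one deliberate simplification called out in the comment: the patch's merge loop does not break after the first partition clearing LargeFnOverlapForMerge, so a sufficiently large root can be duplicated into several qualifying partitions; the sketch stops at the first.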
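
Similarly, here is an illustrative sketch of the new root-discovery step added to splitAMDGPUModule. Again, this is assumption-laden example code rather than the patch's code: Fn, gatherRoots, and the precomputed Reachable set are invented for the example (the patch walks the call graph through addAllDependencies instead). The idea: kernels seed the worklist, and any remaining function that no kernel reaches and that nothing calls directly becomes an extra root so it still lands in a partition.

#include <set>
#include <vector>

// Toy stand-in for llvm::Function.
struct Fn {
  int Id = 0;
  bool IsKernel = false;
  bool HasDirectCaller = false; // cf. hasDirectCaller() in the patch
  std::vector<int> Reachable;   // this function plus everything it calls
};

static std::vector<const Fn *> gatherRoots(const std::vector<Fn> &Module) {
  std::vector<const Fn *> Roots;
  std::set<int> Seen;

  // First, every kernel is a root; remember everything kernels reach.
  for (const Fn &F : Module) {
    if (F.IsKernel) {
      Roots.push_back(&F);
      Seen.insert(F.Id);
      Seen.insert(F.Reachable.begin(), F.Reachable.end());
    }
  }

  // Then the leftovers: functions no kernel reaches and nothing calls
  // directly. They can't actually run, but they must still be placed in
  // some partition or the final link would see undefined references.
  for (const Fn &F : Module)
    if (!F.IsKernel && !Seen.count(F.Id) && !F.HasDirectCaller)
      Roots.push_back(&F);

  return Roots;
}

This mirrors the debug-non-kernel-root.ll test above: func_3 has no direct caller and is unreachable from kernel_1, so it shows up as a second [root] entry in the worklist dump.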