Skip to content

[SystemZ] Remove high inlining threshold multiplier. #106058

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Oct 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 71 additions & 5 deletions llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,17 +53,83 @@ static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
return UsedAsMemCpySource;
}

/// Tally the non-volatile loads and stores located in function \p F that use
/// \p Ptr as their address, either directly or through a GetElementPtr
/// (chain) whose base pointer is \p Ptr. Results are accumulated into
/// \p NumStores and \p NumLoads. Nothing is counted if \p Ptr does not have
/// pointer type.
static void countNumMemAccesses(const Value *Ptr, unsigned &NumStores,
                                unsigned &NumLoads, const Function *F) {
  if (!isa<PointerType>(Ptr->getType()))
    return;

  for (const User *U : Ptr->users()) {
    const auto *I = dyn_cast<Instruction>(U);
    // Only instructions that belong to F are of interest.
    if (!I || I->getParent()->getParent() != F)
      continue;

    if (const auto *Store = dyn_cast<StoreInst>(I)) {
      // Ptr has to be the address written to, not the value being stored.
      if (Store->getPointerOperand() == Ptr && !Store->isVolatile())
        ++NumStores;
      continue;
    }

    if (const auto *Load = dyn_cast<LoadInst>(I)) {
      if (Load->getPointerOperand() == Ptr && !Load->isVolatile())
        ++NumLoads;
      continue;
    }

    if (const auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
      // Follow addresses computed from Ptr and count their accesses as well.
      if (GEP->getPointerOperand() == Ptr)
        countNumMemAccesses(GEP, NumStores, NumLoads, F);
    }
  }
}

unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
unsigned Bonus = 0;
const Function *Caller = CB->getParent()->getParent();
const Function *Callee = CB->getCalledFunction();
if (!Callee)
return 0;
const Module *M = Caller->getParent();

// Increase the threshold if an incoming argument is used only as a memcpy
// source.
if (Function *Callee = CB->getCalledFunction())
for (Argument &Arg : Callee->args()) {
bool OtherUse = false;
if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse)
Bonus += 150;
for (const Argument &Arg : Callee->args()) {
bool OtherUse = false;
if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse) {
Bonus = 1000;
break;
}
}

// Give bonus for globals used much in both caller and callee.
std::set<const GlobalVariable *> CalleeGlobals;
std::set<const GlobalVariable *> CallerGlobals;
for (const GlobalVariable &Global : M->globals())
for (const User *U : Global.users())
if (const Instruction *User = dyn_cast<Instruction>(U)) {
if (User->getParent()->getParent() == Callee)
CalleeGlobals.insert(&Global);
if (User->getParent()->getParent() == Caller)
CallerGlobals.insert(&Global);
}
for (auto *GV : CalleeGlobals)
if (CallerGlobals.count(GV)) {
unsigned CalleeStores = 0, CalleeLoads = 0;
unsigned CallerStores = 0, CallerLoads = 0;
countNumMemAccesses(GV, CalleeStores, CalleeLoads, Callee);
countNumMemAccesses(GV, CallerStores, CallerLoads, Caller);
if ((CalleeStores + CalleeLoads) > 10 &&
(CallerStores + CallerLoads) > 10) {
Bonus = 1000;
break;
}
}

// Give bonus when Callee accesses an Alloca of Caller heavily.
unsigned NumStores = 0;
unsigned NumLoads = 0;
for (unsigned OpIdx = 0; OpIdx != Callee->arg_size(); ++OpIdx) {
Value *CallerArg = CB->getArgOperand(OpIdx);
Argument *CalleeArg = Callee->getArg(OpIdx);
if (isa<AllocaInst>(CallerArg))
countNumMemAccesses(CalleeArg, NumStores, NumLoads, Callee);
}
if (NumLoads > 10)
Bonus += NumLoads * 50;
if (NumStores > 10)
Bonus += NumStores * 50;
Bonus = std::min(Bonus, unsigned(1000));

LLVM_DEBUG(if (Bonus)
dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
Expand Down
1 change: 0 additions & 1 deletion llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
/// \name Scalar TTI Implementations
/// @{

unsigned getInliningThresholdMultiplier() const { return 3; }
unsigned adjustInliningThreshold(const CallBase *CB) const;

InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
Expand Down
139 changes: 136 additions & 3 deletions llvm/test/CodeGen/SystemZ/inline-thresh-adjust.ll
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
; RUN: opt < %s -mtriple=systemz-unknown -mcpu=z15 -passes='cgscc(inline)' -disable-output \
; RUN: -debug-only=inline,systemztti 2>&1 | FileCheck %s
; REQUIRES: asserts
;

; Check that the inlining threshold is incremented for a function using an
; argument only as a memcpy source.

;
; CHECK: Inlining calls in: root_function
; CHECK: Inlining {{.*}} Call: call void @leaf_function_A(ptr %Dst)
; CHECK: ++ SZTTI Adding inlining bonus: 150
; CHECK: ++ SZTTI Adding inlining bonus: 1000
; CHECK: Inlining {{.*}} Call: call void @leaf_function_B(ptr %Dst, ptr %Src)

define void @leaf_function_A(ptr %Dst) {
Expand All @@ -30,3 +30,136 @@ entry:
}

declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)

; Check that the inlining threshold is incremented in case of multiple
; accesses of a global variable by both caller and callee (which is true here
; after the first call is inlined).
;
; CHECK: Inlining calls in: Caller1
; CHECK: ++ SZTTI Adding inlining bonus: 1000

@GlobV = external global i32

; Caller1 branches on %cond1 and calls @Callee1 on either path, giving the
; inliner two call sites of the same callee. Caller1 contains no direct access
; to @GlobV of its own; both calls pass a null pointer as the first argument.
define i64 @Caller1(i1 %cond1, i32 %0) #0 {
entry:
br i1 %cond1, label %sw.bb3437, label %fake_end

common.ret: ; preds = %fake_end, %sw.bb3437
ret i64 0

; First call site: forwards Caller1's own i32 argument.
sw.bb3437: ; preds = %entry
%call34652 = call i32 @Callee1(ptr null, i32 %0)
br label %common.ret

; Second call site: passes the constant 0 instead.
fake_end: ; preds = %entry
%call57981 = call i32 @Callee1(ptr null, i32 0)
br label %common.ret
}

; Callee1 accesses the global @GlobV repeatedly: the body holds 5 loads and
; 6 stores of @GlobV in total (11 accesses), spread over the two branches
; selected by the sign test on %parenfloor. The interleaved stores go through
; the pointer argument %rex. Always returns 0.
define i32 @Callee1(ptr %rex, i32 %parenfloor) #0 {
entry:
%cmp21 = icmp slt i32 %parenfloor, 0
br i1 %cmp21, label %for.body, label %for.end

common.ret: ; preds = %for.end, %for.body
ret i32 0

; This path has 3 loads and 3 stores of @GlobV.
for.body: ; preds = %entry
%0 = load i32, ptr @GlobV, align 4
%inc = or i32 %0, 1
store i32 %inc, ptr @GlobV, align 4
store i64 0, ptr %rex, align 8
%1 = load i32, ptr @GlobV, align 4
%inc28 = or i32 %1, 1
store i32 %inc28, ptr @GlobV, align 4
store i64 0, ptr %rex, align 8
%2 = load i32, ptr @GlobV, align 4
%inc35 = or i32 %2, 1
store i32 %inc35, ptr @GlobV, align 4
store i32 0, ptr %rex, align 8
br label %common.ret

; This path has 2 loads and 3 stores of @GlobV.
for.end: ; preds = %entry
store i32 0, ptr @GlobV, align 4
store i32 0, ptr %rex, align 8
%3 = load i32, ptr @GlobV, align 4
%inc42 = or i32 %3, 1
store i32 %inc42, ptr @GlobV, align 4
store i32 0, ptr %rex, align 8
%4 = load i32, ptr @GlobV, align 4
%inc48 = or i32 %4, 1
store i32 %inc48, ptr @GlobV, align 4
br label %common.ret
}

; Check that the inlining threshold is incremented for a function that is
; accessing an alloca of the caller multiple times.
;
; CHECK: Inlining calls in: Caller2
; CHECK: ++ SZTTI Adding inlining bonus: 550

; Caller2 allocates an [80 x i64] array on its own stack and passes its
; address straight to @Callee2, so the call argument is an alloca of the
; caller. Always returns false.
define i1 @Caller2() {
entry:
%A = alloca [80 x i64], align 8
call void @Callee2(ptr %A)
ret i1 false
}

; Callee2 only reads through its pointer argument %Arg and never stores to it.
; The body contains 11 loads based on %Arg: 3 use %Arg directly (%1, %2, %4)
; and 8 use a getelementptr of %Arg (%0, %3, %5, %6, %7, %8, %10, %11). All
; paths end in common.ret, which returns void.
define void @Callee2(ptr nocapture readonly %Arg) {
entry:
%nonzero = getelementptr i8, ptr %Arg, i64 48
%0 = load i32, ptr %nonzero, align 8
%tobool1.not = icmp eq i32 %0, 0
br i1 %tobool1.not, label %if.else38, label %if.then2

if.then2: ; preds = %entry
%1 = load i32, ptr %Arg, align 4
%tobool4.not = icmp eq i32 %1, 0
br i1 %tobool4.not, label %common.ret, label %if.then5

if.then5: ; preds = %if.then2
%2 = load double, ptr %Arg, align 8
%slab_den = getelementptr i8, ptr %Arg, i64 24
%3 = load double, ptr %slab_den, align 8
%mul = fmul double %2, %3
%cmp = fcmp olt double %mul, 0.000000e+00
br i1 %cmp, label %common.ret, label %if.end55

common.ret: ; preds = %if.end100, %if.else79, %if.end55, %if.else38, %if.then5, %if.then2
ret void

if.else38: ; preds = %entry
%4 = load double, ptr %Arg, align 8
%cmp52 = fcmp ogt double %4, 0.000000e+00
br i1 %cmp52, label %common.ret, label %if.end55

if.end55: ; preds = %if.else38, %if.then5
%arrayidx57 = getelementptr i8, ptr %Arg, i64 52
%5 = load i32, ptr %arrayidx57, align 4
%tobool58.not = icmp eq i32 %5, 0
br i1 %tobool58.not, label %common.ret, label %if.then59

if.then59: ; preds = %if.end55
%arrayidx61 = getelementptr i8, ptr %Arg, i64 64
%6 = load i32, ptr %arrayidx61, align 4
%tobool62.not = icmp eq i32 %6, 0
br i1 %tobool62.not, label %if.else79, label %if.end100

if.else79: ; preds = %if.then59
%arrayidx84 = getelementptr i8, ptr %Arg, i64 8
%7 = load double, ptr %arrayidx84, align 8
%arrayidx87 = getelementptr i8, ptr %Arg, i64 32
%8 = load double, ptr %arrayidx87, align 8
%mul88 = fmul double %7, %8
%9 = fcmp olt double %mul88, 0.000000e+00
br i1 %9, label %common.ret, label %if.end100

; The comparison result %cmp181 below is not used by any later instruction;
; the block branches unconditionally to common.ret.
if.end100: ; preds = %if.else79, %if.then59
%arrayidx151 = getelementptr i8, ptr %Arg, i64 16
%10 = load double, ptr %arrayidx151, align 8
%arrayidx154 = getelementptr i8, ptr %Arg, i64 40
%11 = load double, ptr %arrayidx154, align 8
%mul155 = fmul double %10, %11
%cmp181 = fcmp olt double %mul155, 0.000000e+00
br label %common.ret
}
Loading