Skip to content

Commit ed52ee4

Browse files
committed
[clang][FMV] Direct-call multi-versioned callees from multi-versioned callers
... when there is a callee with a matching feature set and no higher-priority callee. This optimization helps the inliner see past the ifunc+resolver to the callee that the call is known to always land on. This is a conservative implementation of: llvm/llvm-project#71714
1 parent 70eb0e3 commit ed52ee4

File tree

3 files changed

+318
-1
lines changed

3 files changed

+318
-1
lines changed

clang/lib/CodeGen/CGCall.cpp

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4966,6 +4966,11 @@ static unsigned getMaxVectorWidth(const llvm::Type *Ty) {
49664966
return MaxVectorWidth;
49674967
}
49684968

4969+
// FIXME: declare this in a shared header instead of forward-declaring the CodeGenModule.cpp definition here.
4970+
unsigned
4971+
TargetMVPriority(const TargetInfo &TI,
4972+
const CodeGenFunction::MultiVersionResolverOption &RO);
4973+
49694974
RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
49704975
const CGCallee &Callee,
49714976
ReturnValueSlot ReturnValue,
@@ -5437,6 +5442,73 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
54375442
const CGCallee &ConcreteCallee = Callee.prepareConcreteCallee(*this);
54385443
llvm::Value *CalleePtr = ConcreteCallee.getFunctionPointer();
54395444

5445+
// If a multi-versioned caller calls a multi-versioned callee, skip the
5446+
// resolver when there is a precise match on the feature sets, and no
5447+
// possibility of a better match at runtime.
5448+
if (const auto *CallerFD = dyn_cast_or_null<FunctionDecl>(CurGD.getDecl()))
5449+
if (const auto *CallerTVA = CallerFD->getAttr<TargetVersionAttr>())
5450+
if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(TargetDecl))
5451+
// FIXME: do the same where either the caller or callee is
5452+
// target_clones.
5453+
if (FD->isTargetMultiVersion()) {
5454+
llvm::SmallVector<StringRef, 8> CallerFeats;
5455+
CallerTVA->getFeatures(CallerFeats);
5456+
MultiVersionResolverOption CallerMVRO(nullptr, "", CallerFeats);
5457+
5458+
bool HasHigherPriorityCallee = false;
5459+
llvm::Constant *FoundMatchingCallee = nullptr;
5460+
getContext().forEachMultiversionedFunctionVersion(
5461+
FD, [this, FD, &CallerMVRO, &HasHigherPriorityCallee,
5462+
&FoundMatchingCallee](const FunctionDecl *CurFD) {
5463+
const auto *CalleeTVA = CurFD->getAttr<TargetVersionAttr>();
5464+
5465+
GlobalDecl CurGD{
5466+
(CurFD->isDefined() ? CurFD->getDefinition() : CurFD)};
5467+
StringRef MangledName = CGM.getMangledName(CurFD);
5468+
5469+
llvm::SmallVector<StringRef, 8> CalleeFeats;
5470+
CalleeTVA->getFeatures(CalleeFeats);
5471+
MultiVersionResolverOption CalleeMVRO(nullptr, "", CalleeFeats);
5472+
5473+
const TargetInfo &TI = getTarget();
5474+
5475+
// If there is a higher priority callee, we can't do the
5476+
// optimization at all, as it would be a valid choice at
5477+
// runtime.
5478+
if (TargetMVPriority(TI, CalleeMVRO) >
5479+
TargetMVPriority(TI, CallerMVRO)) {
5480+
HasHigherPriorityCallee = true;
5481+
return;
5482+
}
5483+
5484+
// FIXME: we could allow a lower-priority match when the
5485+
// features are a proper subset. But for now, to keep things
5486+
// simpler, we only care about a precise match.
5487+
if (TargetMVPriority(TI, CalleeMVRO) <
5488+
TargetMVPriority(TI, CallerMVRO))
5489+
return;
5490+
5491+
if (llvm::Constant *Func = CGM.GetGlobalValue(MangledName)) {
5492+
FoundMatchingCallee = Func;
5493+
return;
5494+
}
5495+
5496+
if (CurFD->isDefined()) {
5497+
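// FIXME: no global with this mangled name exists yet and it is unclear how to get the definition's address here, so this case makes no direct call.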
5498+
} else {
5499+
const CGFunctionInfo &FI =
5500+
getTypes().arrangeGlobalDeclaration(FD);
5501+
llvm::FunctionType *Ty = getTypes().GetFunctionType(FI);
5502+
FoundMatchingCallee =
5503+
CGM.GetAddrOfFunction(CurGD, Ty, /*ForVTable=*/false,
5504+
/*DontDefer=*/false, ForDefinition);
5505+
}
5506+
});
5507+
5508+
if (FoundMatchingCallee && !HasHigherPriorityCallee)
5509+
CalleePtr = FoundMatchingCallee;
5510+
}
5511+
54405512
// If we're using inalloca, set up that argument.
54415513
if (ArgMemory.isValid()) {
54425514
llvm::Value *Arg = ArgMemory.getPointer();

clang/lib/CodeGen/CodeGenModule.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4092,7 +4092,7 @@ void CodeGenModule::EmitGlobalDefinition(GlobalDecl GD, llvm::GlobalValue *GV) {
40924092
static void ReplaceUsesOfNonProtoTypeWithRealFunction(llvm::GlobalValue *Old,
40934093
llvm::Function *NewFn);
40944094

4095-
static unsigned
4095+
unsigned
40964096
TargetMVPriority(const TargetInfo &TI,
40974097
const CodeGenFunction::MultiVersionResolverOption &RO) {
40984098
unsigned Priority = 0;
Lines changed: 245 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,245 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs
2+
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -emit-llvm -o - %s | FileCheck %s
3+
4+
// Check that we make a direct call from direct_caller._Msimd to
5+
// direct_callee._Msimd when there is no better option.
6+
__attribute__((target_version("simd"))) void direct_callee(void) {}
7+
__attribute__((target_version("default"))) void direct_callee(void) {}
8+
__attribute__((target_version("simd"))) void direct_caller(void) { direct_callee(); }
9+
__attribute__((target_version("default"))) void direct_caller(void) { direct_callee(); }
10+
11+
// ... and that we go through the ifunc+resolver when there is a better option
12+
// that might be chosen at runtime.
13+
__attribute__((target_version("simd"))) void resolved_callee1(void) {}
14+
__attribute__((target_version("fcma"))) void resolved_callee1(void) {}
15+
__attribute__((target_version("default"))) void resolved_callee1(void) {}
16+
__attribute__((target_version("simd"))) void resolved_caller1(void) { resolved_callee1(); }
17+
__attribute__((target_version("default"))) void resolved_caller1(void) { resolved_callee1(); }
18+
19+
// FIXME: we could make a direct call in cases like this:
20+
__attribute__((target_version("fp"))) void resolved_callee2(void) {}
21+
__attribute__((target_version("default"))) void resolved_callee2(void) {}
22+
__attribute__((target_version("simd+fp"))) void resolved_caller2(void) { resolved_callee2(); }
23+
__attribute__((target_version("default"))) void resolved_caller2(void) { resolved_callee2(); }
24+
25+
void source() {
26+
direct_caller();
27+
resolved_caller1();
28+
resolved_caller2();
29+
}
30+
31+
//.
32+
// CHECK: @__aarch64_cpu_features = external dso_local global { i64 }
33+
// CHECK: @direct_callee.ifunc = weak_odr ifunc void (), ptr @direct_callee.resolver
34+
// CHECK: @direct_caller.ifunc = weak_odr ifunc void (), ptr @direct_caller.resolver
35+
// CHECK: @resolved_callee1.ifunc = weak_odr ifunc void (), ptr @resolved_callee1.resolver
36+
// CHECK: @resolved_caller1.ifunc = weak_odr ifunc void (), ptr @resolved_caller1.resolver
37+
// CHECK: @resolved_callee2.ifunc = weak_odr ifunc void (), ptr @resolved_callee2.resolver
38+
// CHECK: @resolved_caller2.ifunc = weak_odr ifunc void (), ptr @resolved_caller2.resolver
39+
//.
40+
// CHECK: Function Attrs: noinline nounwind optnone
41+
// CHECK-LABEL: define {{[^@]+}}@direct_callee._Msimd
42+
// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
43+
// CHECK-NEXT: entry:
44+
// CHECK-NEXT: ret void
45+
//
46+
//
47+
// CHECK-LABEL: define {{[^@]+}}@direct_callee.resolver() comdat {
48+
// CHECK-NEXT: resolver_entry:
49+
// CHECK-NEXT: call void @__init_cpu_features_resolver()
50+
// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
51+
// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 512
52+
// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 512
53+
// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
54+
// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
55+
// CHECK: resolver_return:
56+
// CHECK-NEXT: ret ptr @direct_callee._Msimd
57+
// CHECK: resolver_else:
58+
// CHECK-NEXT: ret ptr @direct_callee.default
59+
//
60+
//
61+
// CHECK: Function Attrs: noinline nounwind optnone
62+
// CHECK-LABEL: define {{[^@]+}}@direct_caller._Msimd
63+
// CHECK-SAME: () #[[ATTR0]] {
64+
// CHECK-NEXT: entry:
65+
// CHECK-NEXT: call void @direct_callee._Msimd()
66+
// CHECK-NEXT: ret void
67+
//
68+
//
69+
// CHECK-LABEL: define {{[^@]+}}@direct_caller.resolver() comdat {
70+
// CHECK-NEXT: resolver_entry:
71+
// CHECK-NEXT: call void @__init_cpu_features_resolver()
72+
// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
73+
// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 512
74+
// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 512
75+
// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
76+
// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
77+
// CHECK: resolver_return:
78+
// CHECK-NEXT: ret ptr @direct_caller._Msimd
79+
// CHECK: resolver_else:
80+
// CHECK-NEXT: ret ptr @direct_caller.default
81+
//
82+
//
83+
// CHECK: Function Attrs: noinline nounwind optnone
84+
// CHECK-LABEL: define {{[^@]+}}@resolved_callee1._Msimd
85+
// CHECK-SAME: () #[[ATTR0]] {
86+
// CHECK-NEXT: entry:
87+
// CHECK-NEXT: ret void
88+
//
89+
//
90+
// CHECK-LABEL: define {{[^@]+}}@resolved_callee1.resolver() comdat {
91+
// CHECK-NEXT: resolver_entry:
92+
// CHECK-NEXT: call void @__init_cpu_features_resolver()
93+
// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
94+
// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 2097152
95+
// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 2097152
96+
// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
97+
// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
98+
// CHECK: resolver_return:
99+
// CHECK-NEXT: ret ptr @resolved_callee1._Mfcma
100+
// CHECK: resolver_else:
101+
// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
102+
// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 512
103+
// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 512
104+
// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]]
105+
// CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
106+
// CHECK: resolver_return1:
107+
// CHECK-NEXT: ret ptr @resolved_callee1._Msimd
108+
// CHECK: resolver_else2:
109+
// CHECK-NEXT: ret ptr @resolved_callee1.default
110+
//
111+
//
112+
// CHECK: Function Attrs: noinline nounwind optnone
113+
// CHECK-LABEL: define {{[^@]+}}@resolved_caller1._Msimd
114+
// CHECK-SAME: () #[[ATTR0]] {
115+
// CHECK-NEXT: entry:
116+
// CHECK-NEXT: call void @resolved_callee1.ifunc()
117+
// CHECK-NEXT: ret void
118+
//
119+
//
120+
// CHECK-LABEL: define {{[^@]+}}@resolved_caller1.resolver() comdat {
121+
// CHECK-NEXT: resolver_entry:
122+
// CHECK-NEXT: call void @__init_cpu_features_resolver()
123+
// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
124+
// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 512
125+
// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 512
126+
// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
127+
// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
128+
// CHECK: resolver_return:
129+
// CHECK-NEXT: ret ptr @resolved_caller1._Msimd
130+
// CHECK: resolver_else:
131+
// CHECK-NEXT: ret ptr @resolved_caller1.default
132+
//
133+
//
134+
// CHECK: Function Attrs: noinline nounwind optnone
135+
// CHECK-LABEL: define {{[^@]+}}@resolved_callee2._Mfp
136+
// CHECK-SAME: () #[[ATTR0]] {
137+
// CHECK-NEXT: entry:
138+
// CHECK-NEXT: ret void
139+
//
140+
//
141+
// CHECK-LABEL: define {{[^@]+}}@resolved_callee2.resolver() comdat {
142+
// CHECK-NEXT: resolver_entry:
143+
// CHECK-NEXT: call void @__init_cpu_features_resolver()
144+
// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
145+
// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 256
146+
// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 256
147+
// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
148+
// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
149+
// CHECK: resolver_return:
150+
// CHECK-NEXT: ret ptr @resolved_callee2._Mfp
151+
// CHECK: resolver_else:
152+
// CHECK-NEXT: ret ptr @resolved_callee2.default
153+
//
154+
//
155+
// CHECK: Function Attrs: noinline nounwind optnone
156+
// CHECK-LABEL: define {{[^@]+}}@resolved_caller2._MfpMsimd
157+
// CHECK-SAME: () #[[ATTR0]] {
158+
// CHECK-NEXT: entry:
159+
// CHECK-NEXT: call void @resolved_callee2.ifunc()
160+
// CHECK-NEXT: ret void
161+
//
162+
//
163+
// CHECK-LABEL: define {{[^@]+}}@resolved_caller2.resolver() comdat {
164+
// CHECK-NEXT: resolver_entry:
165+
// CHECK-NEXT: call void @__init_cpu_features_resolver()
166+
// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
167+
// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 768
168+
// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 768
169+
// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
170+
// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
171+
// CHECK: resolver_return:
172+
// CHECK-NEXT: ret ptr @resolved_caller2._MfpMsimd
173+
// CHECK: resolver_else:
174+
// CHECK-NEXT: ret ptr @resolved_caller2.default
175+
//
176+
//
177+
// CHECK: Function Attrs: noinline nounwind optnone
178+
// CHECK-LABEL: define {{[^@]+}}@source
179+
// CHECK-SAME: () #[[ATTR1:[0-9]+]] {
180+
// CHECK-NEXT: entry:
181+
// CHECK-NEXT: call void @direct_caller.ifunc()
182+
// CHECK-NEXT: call void @resolved_caller1.ifunc()
183+
// CHECK-NEXT: call void @resolved_caller2.ifunc()
184+
// CHECK-NEXT: ret void
185+
//
186+
//
187+
// CHECK: Function Attrs: noinline nounwind optnone
188+
// CHECK-LABEL: define {{[^@]+}}@direct_callee.default
189+
// CHECK-SAME: () #[[ATTR1]] {
190+
// CHECK-NEXT: entry:
191+
// CHECK-NEXT: ret void
192+
//
193+
//
194+
// CHECK: Function Attrs: noinline nounwind optnone
195+
// CHECK-LABEL: define {{[^@]+}}@direct_caller.default
196+
// CHECK-SAME: () #[[ATTR1]] {
197+
// CHECK-NEXT: entry:
198+
// CHECK-NEXT: call void @direct_callee.ifunc()
199+
// CHECK-NEXT: ret void
200+
//
201+
//
202+
// CHECK: Function Attrs: noinline nounwind optnone
203+
// CHECK-LABEL: define {{[^@]+}}@resolved_callee1._Mfcma
204+
// CHECK-SAME: () #[[ATTR2:[0-9]+]] {
205+
// CHECK-NEXT: entry:
206+
// CHECK-NEXT: ret void
207+
//
208+
//
209+
// CHECK: Function Attrs: noinline nounwind optnone
210+
// CHECK-LABEL: define {{[^@]+}}@resolved_callee1.default
211+
// CHECK-SAME: () #[[ATTR1]] {
212+
// CHECK-NEXT: entry:
213+
// CHECK-NEXT: ret void
214+
//
215+
//
216+
// CHECK: Function Attrs: noinline nounwind optnone
217+
// CHECK-LABEL: define {{[^@]+}}@resolved_caller1.default
218+
// CHECK-SAME: () #[[ATTR1]] {
219+
// CHECK-NEXT: entry:
220+
// CHECK-NEXT: call void @resolved_callee1.ifunc()
221+
// CHECK-NEXT: ret void
222+
//
223+
//
224+
// CHECK: Function Attrs: noinline nounwind optnone
225+
// CHECK-LABEL: define {{[^@]+}}@resolved_callee2.default
226+
// CHECK-SAME: () #[[ATTR1]] {
227+
// CHECK-NEXT: entry:
228+
// CHECK-NEXT: ret void
229+
//
230+
//
231+
// CHECK: Function Attrs: noinline nounwind optnone
232+
// CHECK-LABEL: define {{[^@]+}}@resolved_caller2.default
233+
// CHECK-SAME: () #[[ATTR1]] {
234+
// CHECK-NEXT: entry:
235+
// CHECK-NEXT: call void @resolved_callee2.ifunc()
236+
// CHECK-NEXT: ret void
237+
//
238+
//.
239+
// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" }
240+
// CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
241+
// CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+complxnum,+fp-armv8,+neon" }
242+
//.
243+
// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
244+
// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
245+
//.

0 commit comments

Comments
 (0)