-
Notifications
You must be signed in to change notification settings - Fork 13.5k
Try to use non-volatile registers for preserve_none
parameters
#88333
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Thank you for submitting a Pull Request (PR) to the LLVM Project! This PR will be automatically labeled and the relevant teams will be If you wish to, you can add reviewers by using the "Reviewers" section on this page. If this is not working for you, it is probably because you do not have write If you have received no comments on your PR for a week, you can request a review If you have further questions, they may be answered by the LLVM GitHub User Guide. You can also ask questions in a comment on this PR, on the LLVM Discord or on the forums. |
@llvm/pr-subscribers-clang @llvm/pr-subscribers-backend-x86 Author: Brandt Bucher (brandtbucher) ChangesThis uses non-volatile registers for the first four (six on Windows) registers used for Example: void boring(void);
__attribute__((preserve_none)) void (continuation)(void *, void *, void *, void *);
__attribute__((preserve_none)) void entry(void *a, void *b, void *c, void *d)
{
boring();
__attribute__((musttail)) return continuation(a, b, c, d);
} Before: pushq %rax
movq %rcx, %rbx
movq %rdx, %r14
movq %rsi, %r15
movq %rdi, %r12
callq boring@<!-- -->PLT
movq %r12, %rdi
movq %r15, %rsi
movq %r14, %rdx
movq %rbx, %rcx
popq %rax
jmp continuation@<!-- -->PLT After: pushq %rax
callq boring@<!-- -->PLT
popq %rax
jmp continuation@<!-- -->PLT Full diff: https://github.com/llvm/llvm-project/pull/88333.diff 3 Files Affected:
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index 12178bcaf042db..9ec68bfb8e0f7e 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -1063,11 +1063,13 @@ def CC_X86_64_Preserve_None : CallingConv<[
// - R10 'nest' parameter
// - RBX base pointer
// - R16 - R31 these are not available everywhere
- CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D,
- R11D, R12D, R13D, R14D, R15D, EAX]>>,
+ // Use non-volatile registers first, so functions using this convention can
+ // call "normal" functions without saving and restoring incoming values:
+ CCIfType<[i32], CCAssignToReg<[R12D, R13D, R14D, R15D, EDI, ESI,
+ EDX, ECX, R8D, R9D, R11D, EAX]>>,
- CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8, R9,
- R11, R12, R13, R14, R15, RAX]>>,
+ CCIfType<[i64], CCAssignToReg<[R12, R13, R14, R15, RDI, RSI,
+ RDX, RCX, R8, R9, R11, RAX]>>,
// Otherwise it's the same as the regular C calling convention.
CCDelegateTo<CC_X86_64_C>
diff --git a/llvm/test/CodeGen/X86/preserve_nonecc_call.ll b/llvm/test/CodeGen/X86/preserve_nonecc_call.ll
index e4ad056913c5dc..500ebb139811aa 100644
--- a/llvm/test/CodeGen/X86/preserve_nonecc_call.ll
+++ b/llvm/test/CodeGen/X86/preserve_nonecc_call.ll
@@ -27,6 +27,7 @@ define void @caller1(ptr %a) {
; CHECK-NEXT: .cfi_offset %r13, -32
; CHECK-NEXT: .cfi_offset %r14, -24
; CHECK-NEXT: .cfi_offset %r15, -16
+; CHECK-NEXT: movq %rdi, %r12
; CHECK-NEXT: callq callee@PLT
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 40
@@ -61,17 +62,17 @@ define preserve_nonecc i64 @callee_with_many_param(i64 %a1, i64 %a2, i64 %a3, i6
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: movq %r13, %r12
+; CHECK-NEXT: movq %r14, %r13
+; CHECK-NEXT: movq %r15, %r14
+; CHECK-NEXT: movq %rdi, %r15
; CHECK-NEXT: movq %rsi, %rdi
; CHECK-NEXT: movq %rdx, %rsi
; CHECK-NEXT: movq %rcx, %rdx
; CHECK-NEXT: movq %r8, %rcx
; CHECK-NEXT: movq %r9, %r8
; CHECK-NEXT: movq %r11, %r9
-; CHECK-NEXT: movq %r12, %r11
-; CHECK-NEXT: movq %r13, %r12
-; CHECK-NEXT: movq %r14, %r13
-; CHECK-NEXT: movq %r15, %r14
-; CHECK-NEXT: movq %rax, %r15
+; CHECK-NEXT: movq %rax, %r11
; CHECK-NEXT: callq callee_with_many_param2@PLT
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -98,17 +99,17 @@ define i64 @caller3() {
; CHECK-NEXT: .cfi_offset %r13, -32
; CHECK-NEXT: .cfi_offset %r14, -24
; CHECK-NEXT: .cfi_offset %r15, -16
-; CHECK-NEXT: movl $1, %edi
-; CHECK-NEXT: movl $2, %esi
-; CHECK-NEXT: movl $3, %edx
-; CHECK-NEXT: movl $4, %ecx
-; CHECK-NEXT: movl $5, %r8d
-; CHECK-NEXT: movl $6, %r9d
-; CHECK-NEXT: movl $7, %r11d
-; CHECK-NEXT: movl $8, %r12d
-; CHECK-NEXT: movl $9, %r13d
-; CHECK-NEXT: movl $10, %r14d
-; CHECK-NEXT: movl $11, %r15d
+; CHECK-NEXT: movl $1, %r12d
+; CHECK-NEXT: movl $2, %r13d
+; CHECK-NEXT: movl $3, %r14d
+; CHECK-NEXT: movl $4, %r15d
+; CHECK-NEXT: movl $5, %edi
+; CHECK-NEXT: movl $6, %esi
+; CHECK-NEXT: movl $7, %edx
+; CHECK-NEXT: movl $8, %ecx
+; CHECK-NEXT: movl $9, %r8d
+; CHECK-NEXT: movl $10, %r9d
+; CHECK-NEXT: movl $11, %r11d
; CHECK-NEXT: movl $12, %eax
; CHECK-NEXT: callq callee_with_many_param@PLT
; CHECK-NEXT: popq %rbx
@@ -125,3 +126,20 @@ define i64 @caller3() {
%ret = call preserve_nonecc i64 @callee_with_many_param(i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12)
ret i64 %ret
}
+
+; Non-volatile registers are used to pass the first few parameters.
+declare void @boring()
+declare preserve_nonecc void @continuation(ptr, ptr, ptr, ptr)
+define preserve_nonecc void @entry(ptr %r12, ptr %r13, ptr %r14, ptr %r15) {
+; CHECK-LABEL: entry:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: callq boring@PLT
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: jmp continuation@PLT # TAILCALL
+ call void @boring()
+ musttail call preserve_nonecc void @continuation(ptr %r12, ptr %r13, ptr %r14, ptr %r15)
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/preserve_nonecc_call_win.ll b/llvm/test/CodeGen/X86/preserve_nonecc_call_win.ll
new file mode 100644
index 00000000000000..232ac345057825
--- /dev/null
+++ b/llvm/test/CodeGen/X86/preserve_nonecc_call_win.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=x86_64-pc-windows-msvc -mcpu=corei7 < %s | FileCheck %s
+
+; Non-volatile registers are used to pass the first few parameters.
+declare void @boring()
+declare preserve_nonecc void @continuation(ptr, ptr, ptr, ptr, ptr, ptr)
+define preserve_nonecc void @entry(ptr %r12, ptr %r13, ptr %r14, ptr %r15, ptr %rdi, ptr %rsi) {
+; CHECK-LABEL: entry:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: .seh_stackalloc 40
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: callq boring
+; CHECK-NEXT: nop
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: jmp continuation # TAILCALL
+; CHECK-NEXT: .seh_endproc
+ call void @boring()
+ musttail call preserve_nonecc void @continuation(ptr %r12, ptr %r13, ptr %r14, ptr %r15, ptr %rdi, ptr %rsi)
+ ret void
+}
|
Can you squash these commits? There shouldn't be a commit with just "Try" as the commit message in LLVM. |
90cf5f0
to
8ff8000
Compare
Use non-volatile registers for the first four (six on Windows) registers used for preserve_none argument passing. This allows these registers to stay "pinned" across a chain of tail calls, even when the body of a preserve_none function contains other calls.
8ff8000
to
f797b69
Compare
Done! |
@weiguozhi, @david-xl: Any chance you could review this small change? I think it'd be better to make it sooner rather than later, since it would be an ABI break once LLVM 19 ships. (Sorry for the ping, I don't have permission to add reviewers.) |
@weiguozhi will help with some performance measurement with this change and get back. |
Thanks, I appreciate it! |
I ran the udp protobuf parsing micro benchmarks on my desktop. Following are the results. Base uses the original preserve_none implementation. Test uses the new parameter passing order from this patch.
The first 3 sub tests got performance improvement due to reduced register copy instructions from the preserve_none parsing functions. The last test is not impacted. I checked its profile, found there are more register spilling in parsing function FastSS1. Because there is no live ranges change, I think it's more like a register allocation problem. So the general result is very positive for protobuf parsing. |
Very promising. Can you update the discourse thread with the results and propose for the change in preserve_none? |
The patch looks good to me. Could you also explicitly document that preserve_none's ABI is still unstable, it may be changed again in future if we can find other improvement opportunity. |
Yep. I also updated the docs with the new register-passing order. |
The Windows x64 Buildkite CI job is failing, but I can't find any failed tests in the downloaded logs. Any advice? |
You can download the raw log file from the details view. From the log, it looks like not related to this change -- the flang build run out of heap space: C:\BuildTools\VC\Tools\MSVC\14.29.30133\include\type_traits(1432): fatal error C1060: compiler is out of heap space |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks good to me!
Thanks for the improvement.
Anyone available to merge this (now that it's been approved)? |
@brandtbucher Congratulations on having your first Pull Request (PR) merged into the LLVM Project! Your changes will be combined with recent changes from other authors, then tested Please check whether problems have been caused by your change specifically, as How to do this, and the rest of the post-merge process, is covered in detail here. If your change does cause a problem, it may be reverted, or you can revert it yourself. If you don't get any reports, no action is required from you. Your changes are working as expected, well done! |
I'm really happy to see this proposed and merged; this is how I originally envisioned it would work: haberman@e8d9c75 |
Consider the case that __attribute__((preserve_most)) void boring(void *);
__attribute__((preserve_none)) void (continuation)(void *, void *, void *, void *, void*);
__attribute__((preserve_none)) void entry(void *a, void *b, void *c, void *d, void* e)
{
boring(a);
__attribute__((musttail)) return continuation(a, b, c, d, e);
} It seems we could get even better results for this case (without compromising the regular case) if we rearranged the arguments a bit further. Original order (before this PR):
Current order (after this PR):
Possible improvements:
A
|
This uses non-volatile registers for the first four (six on Windows) registers used for
preserve_none
argument passing. This allows these registers to stay "pinned", even if the body of thepreserve_none
function contains calls to other "normal" functions.Example:
Before:
After: