[windows] Always pass fp128 arguments indirectly #128848
Conversation
Thank you for submitting a Pull Request (PR) to the LLVM Project! This PR will be automatically labeled and the relevant teams will be notified. If you wish, you can add reviewers by using the "Reviewers" section on this page. If this is not working for you, it is probably because you do not have write permissions for the repository; in that case, you can instead tag reviewers by name in a comment. If you have received no comments on your PR for a week, you can request a review by "ping"ing the PR with a comment saying "Ping". The common courtesy ping rate is once a week. Please remember that you are asking for valuable time from other developers. If you have further questions, they may be answered by the LLVM GitHub User Guide. You can also ask questions in a comment on this PR, on the LLVM Discord, or on the forums.
@llvm/pr-subscribers-backend-x86

Author: Trevor Gross (tgross35)

Changes

LLVM expects `__float128` to be both passed and returned in xmm registers on Windows, which disagrees with the Windows x86-64 calling convention: values larger than 64 bits should be passed indirectly. Update LLVM's libcall calling convention to pass `fp128` indirectly, while keeping the return in xmm0. With this change, the calling convention for `i128` and `f128` is the same. This corresponds to the frontend change in [2].

Patch is 121.54 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/128848.diff

3 Files Affected:
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index cf164acba9ec0..0d087e057a2bd 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -639,8 +639,9 @@ def CC_X86_Win64_C : CallingConv<[
// 512 bit vectors are passed by pointer
CCIfType<[v64i8, v32i16, v16i32, v32f16, v32bf16, v16f32, v8f64, v8i64], CCPassIndirect<i64>>,
- // Long doubles are passed by pointer
+ // Float types larger than 64-bits (long double and fp128) are passed by pointer
CCIfType<[f80], CCPassIndirect<i64>>,
+ CCIfType<[f128], CCPassIndirect<i64>>,
// If SSE was disabled, pass FP values smaller than 64-bits as integers in
// GPRs or on the stack.
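To illustrate what this table change does in practice, here is a minimal sketch of my own (not part of the patch) of an fp128 operation that lowers to a libcall under this calling convention:

define fp128 @mul(fp128 %x, fp128 %y) nounwind {
  ; On x86_64-pc-windows-msvc this multiply lowers to a call to __multf3.
  %r = fmul fp128 %x, %y
  ret fp128 %r
}

With the added CCPassIndirect<i64> entry, each fp128 argument is spilled to caller-allocated, 16-byte-aligned stack memory and its address is passed in an integer register (rcx/rdx), instead of the value travelling in xmm0/xmm1; the updated tests below show exactly this pattern.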
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
index 5263e0d4f6f39..a85b53ea62ac7 100644
--- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
@@ -8,6 +8,12 @@
; RUN: llc < %s -O2 -mtriple=i686-linux-gnu -mattr=+sse2 \
; RUN: -enable-legalize-types-checking \
; RUN: | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -O2 -mtriple=x86_64-pc-windows-msvc \
+; RUN: -enable-legalize-types-checking \
+; RUN: | FileCheck %s --check-prefix=WIN
+; RUN: llc < %s -O2 -mtriple=i686-pc-windows-msvc \
+; RUN: -enable-legalize-types-checking \
+; RUN: | FileCheck %s --check-prefix=WIN-X86
; Check all soft floating point library function calls.
@@ -57,6 +63,55 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp {
; X86-NEXT: addl $24, %esp
; X86-NEXT: popl %esi
; X86-NEXT: retl $4
+;
+; WIN-LABEL: add:
+; WIN: # %bb.0: # %entry
+; WIN-NEXT: subq $72, %rsp
+; WIN-NEXT: movaps (%rcx), %xmm0
+; WIN-NEXT: movaps (%rdx), %xmm1
+; WIN-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; WIN-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT: callq __addtf3
+; WIN-NEXT: addq $72, %rsp
+; WIN-NEXT: retq
+;
+; WIN-X86-LABEL: add:
+; WIN-X86: # %bb.0: # %entry
+; WIN-X86-NEXT: pushl %ebp
+; WIN-X86-NEXT: movl %esp, %ebp
+; WIN-X86-NEXT: pushl %edi
+; WIN-X86-NEXT: pushl %esi
+; WIN-X86-NEXT: andl $-16, %esp
+; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: movl 8(%ebp), %esi
+; WIN-X86-NEXT: movl %esp, %eax
+; WIN-X86-NEXT: pushl 40(%ebp)
+; WIN-X86-NEXT: pushl 36(%ebp)
+; WIN-X86-NEXT: pushl 32(%ebp)
+; WIN-X86-NEXT: pushl 28(%ebp)
+; WIN-X86-NEXT: pushl 24(%ebp)
+; WIN-X86-NEXT: pushl 20(%ebp)
+; WIN-X86-NEXT: pushl 16(%ebp)
+; WIN-X86-NEXT: pushl 12(%ebp)
+; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: calll ___addtf3
+; WIN-X86-NEXT: addl $36, %esp
+; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT: movl %edi, 8(%esi)
+; WIN-X86-NEXT: movl %edx, 12(%esi)
+; WIN-X86-NEXT: movl %eax, (%esi)
+; WIN-X86-NEXT: movl %ecx, 4(%esi)
+; WIN-X86-NEXT: movl %esi, %eax
+; WIN-X86-NEXT: leal -8(%ebp), %esp
+; WIN-X86-NEXT: popl %esi
+; WIN-X86-NEXT: popl %edi
+; WIN-X86-NEXT: popl %ebp
+; WIN-X86-NEXT: retl
entry:
%add = call fp128 @llvm.experimental.constrained.fadd.f128(fp128 %x, fp128 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
ret fp128 %add
@@ -108,6 +163,55 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp {
; X86-NEXT: addl $24, %esp
; X86-NEXT: popl %esi
; X86-NEXT: retl $4
+;
+; WIN-LABEL: sub:
+; WIN: # %bb.0: # %entry
+; WIN-NEXT: subq $72, %rsp
+; WIN-NEXT: movaps (%rcx), %xmm0
+; WIN-NEXT: movaps (%rdx), %xmm1
+; WIN-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; WIN-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT: callq __subtf3
+; WIN-NEXT: addq $72, %rsp
+; WIN-NEXT: retq
+;
+; WIN-X86-LABEL: sub:
+; WIN-X86: # %bb.0: # %entry
+; WIN-X86-NEXT: pushl %ebp
+; WIN-X86-NEXT: movl %esp, %ebp
+; WIN-X86-NEXT: pushl %edi
+; WIN-X86-NEXT: pushl %esi
+; WIN-X86-NEXT: andl $-16, %esp
+; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: movl 8(%ebp), %esi
+; WIN-X86-NEXT: movl %esp, %eax
+; WIN-X86-NEXT: pushl 40(%ebp)
+; WIN-X86-NEXT: pushl 36(%ebp)
+; WIN-X86-NEXT: pushl 32(%ebp)
+; WIN-X86-NEXT: pushl 28(%ebp)
+; WIN-X86-NEXT: pushl 24(%ebp)
+; WIN-X86-NEXT: pushl 20(%ebp)
+; WIN-X86-NEXT: pushl 16(%ebp)
+; WIN-X86-NEXT: pushl 12(%ebp)
+; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: calll ___subtf3
+; WIN-X86-NEXT: addl $36, %esp
+; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT: movl %edi, 8(%esi)
+; WIN-X86-NEXT: movl %edx, 12(%esi)
+; WIN-X86-NEXT: movl %eax, (%esi)
+; WIN-X86-NEXT: movl %ecx, 4(%esi)
+; WIN-X86-NEXT: movl %esi, %eax
+; WIN-X86-NEXT: leal -8(%ebp), %esp
+; WIN-X86-NEXT: popl %esi
+; WIN-X86-NEXT: popl %edi
+; WIN-X86-NEXT: popl %ebp
+; WIN-X86-NEXT: retl
entry:
%sub = call fp128 @llvm.experimental.constrained.fsub.f128(fp128 %x, fp128 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
ret fp128 %sub
@@ -159,6 +263,55 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp {
; X86-NEXT: addl $24, %esp
; X86-NEXT: popl %esi
; X86-NEXT: retl $4
+;
+; WIN-LABEL: mul:
+; WIN: # %bb.0: # %entry
+; WIN-NEXT: subq $72, %rsp
+; WIN-NEXT: movaps (%rcx), %xmm0
+; WIN-NEXT: movaps (%rdx), %xmm1
+; WIN-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; WIN-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT: callq __multf3
+; WIN-NEXT: addq $72, %rsp
+; WIN-NEXT: retq
+;
+; WIN-X86-LABEL: mul:
+; WIN-X86: # %bb.0: # %entry
+; WIN-X86-NEXT: pushl %ebp
+; WIN-X86-NEXT: movl %esp, %ebp
+; WIN-X86-NEXT: pushl %edi
+; WIN-X86-NEXT: pushl %esi
+; WIN-X86-NEXT: andl $-16, %esp
+; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: movl 8(%ebp), %esi
+; WIN-X86-NEXT: movl %esp, %eax
+; WIN-X86-NEXT: pushl 40(%ebp)
+; WIN-X86-NEXT: pushl 36(%ebp)
+; WIN-X86-NEXT: pushl 32(%ebp)
+; WIN-X86-NEXT: pushl 28(%ebp)
+; WIN-X86-NEXT: pushl 24(%ebp)
+; WIN-X86-NEXT: pushl 20(%ebp)
+; WIN-X86-NEXT: pushl 16(%ebp)
+; WIN-X86-NEXT: pushl 12(%ebp)
+; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: calll ___multf3
+; WIN-X86-NEXT: addl $36, %esp
+; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT: movl %edi, 8(%esi)
+; WIN-X86-NEXT: movl %edx, 12(%esi)
+; WIN-X86-NEXT: movl %eax, (%esi)
+; WIN-X86-NEXT: movl %ecx, 4(%esi)
+; WIN-X86-NEXT: movl %esi, %eax
+; WIN-X86-NEXT: leal -8(%ebp), %esp
+; WIN-X86-NEXT: popl %esi
+; WIN-X86-NEXT: popl %edi
+; WIN-X86-NEXT: popl %ebp
+; WIN-X86-NEXT: retl
entry:
%mul = call fp128 @llvm.experimental.constrained.fmul.f128(fp128 %x, fp128 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
ret fp128 %mul
@@ -210,6 +363,55 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp {
; X86-NEXT: addl $24, %esp
; X86-NEXT: popl %esi
; X86-NEXT: retl $4
+;
+; WIN-LABEL: div:
+; WIN: # %bb.0: # %entry
+; WIN-NEXT: subq $72, %rsp
+; WIN-NEXT: movaps (%rcx), %xmm0
+; WIN-NEXT: movaps (%rdx), %xmm1
+; WIN-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; WIN-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT: callq __divtf3
+; WIN-NEXT: addq $72, %rsp
+; WIN-NEXT: retq
+;
+; WIN-X86-LABEL: div:
+; WIN-X86: # %bb.0: # %entry
+; WIN-X86-NEXT: pushl %ebp
+; WIN-X86-NEXT: movl %esp, %ebp
+; WIN-X86-NEXT: pushl %edi
+; WIN-X86-NEXT: pushl %esi
+; WIN-X86-NEXT: andl $-16, %esp
+; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: movl 8(%ebp), %esi
+; WIN-X86-NEXT: movl %esp, %eax
+; WIN-X86-NEXT: pushl 40(%ebp)
+; WIN-X86-NEXT: pushl 36(%ebp)
+; WIN-X86-NEXT: pushl 32(%ebp)
+; WIN-X86-NEXT: pushl 28(%ebp)
+; WIN-X86-NEXT: pushl 24(%ebp)
+; WIN-X86-NEXT: pushl 20(%ebp)
+; WIN-X86-NEXT: pushl 16(%ebp)
+; WIN-X86-NEXT: pushl 12(%ebp)
+; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: calll ___divtf3
+; WIN-X86-NEXT: addl $36, %esp
+; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT: movl %edi, 8(%esi)
+; WIN-X86-NEXT: movl %edx, 12(%esi)
+; WIN-X86-NEXT: movl %eax, (%esi)
+; WIN-X86-NEXT: movl %ecx, 4(%esi)
+; WIN-X86-NEXT: movl %esi, %eax
+; WIN-X86-NEXT: leal -8(%ebp), %esp
+; WIN-X86-NEXT: popl %esi
+; WIN-X86-NEXT: popl %edi
+; WIN-X86-NEXT: popl %ebp
+; WIN-X86-NEXT: retl
entry:
%div = call fp128 @llvm.experimental.constrained.fdiv.f128(fp128 %x, fp128 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
ret fp128 %div
@@ -258,6 +460,62 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp {
; X86-NEXT: addl $24, %esp
; X86-NEXT: popl %esi
; X86-NEXT: retl $4
+;
+; WIN-LABEL: fma:
+; WIN: # %bb.0: # %entry
+; WIN-NEXT: subq $88, %rsp
+; WIN-NEXT: movaps (%r8), %xmm0
+; WIN-NEXT: movaps (%rcx), %xmm1
+; WIN-NEXT: movaps (%rdx), %xmm2
+; WIN-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; WIN-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; WIN-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT: leaq {{[0-9]+}}(%rsp), %r8
+; WIN-NEXT: callq fmal
+; WIN-NEXT: addq $88, %rsp
+; WIN-NEXT: retq
+;
+; WIN-X86-LABEL: fma:
+; WIN-X86: # %bb.0: # %entry
+; WIN-X86-NEXT: pushl %ebp
+; WIN-X86-NEXT: movl %esp, %ebp
+; WIN-X86-NEXT: pushl %edi
+; WIN-X86-NEXT: pushl %esi
+; WIN-X86-NEXT: andl $-16, %esp
+; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: movl 8(%ebp), %esi
+; WIN-X86-NEXT: movl %esp, %eax
+; WIN-X86-NEXT: pushl 56(%ebp)
+; WIN-X86-NEXT: pushl 52(%ebp)
+; WIN-X86-NEXT: pushl 48(%ebp)
+; WIN-X86-NEXT: pushl 44(%ebp)
+; WIN-X86-NEXT: pushl 40(%ebp)
+; WIN-X86-NEXT: pushl 36(%ebp)
+; WIN-X86-NEXT: pushl 32(%ebp)
+; WIN-X86-NEXT: pushl 28(%ebp)
+; WIN-X86-NEXT: pushl 24(%ebp)
+; WIN-X86-NEXT: pushl 20(%ebp)
+; WIN-X86-NEXT: pushl 16(%ebp)
+; WIN-X86-NEXT: pushl 12(%ebp)
+; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: calll _fmal
+; WIN-X86-NEXT: addl $52, %esp
+; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT: movl %edi, 8(%esi)
+; WIN-X86-NEXT: movl %edx, 12(%esi)
+; WIN-X86-NEXT: movl %eax, (%esi)
+; WIN-X86-NEXT: movl %ecx, 4(%esi)
+; WIN-X86-NEXT: movl %esi, %eax
+; WIN-X86-NEXT: leal -8(%ebp), %esp
+; WIN-X86-NEXT: popl %esi
+; WIN-X86-NEXT: popl %edi
+; WIN-X86-NEXT: popl %ebp
+; WIN-X86-NEXT: retl
entry:
%fma = call fp128 @llvm.experimental.constrained.fma.f128(fp128 %x, fp128 %y, fp128 %z, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
ret fp128 %fma
@@ -302,6 +560,55 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp {
; X86-NEXT: addl $24, %esp
; X86-NEXT: popl %esi
; X86-NEXT: retl $4
+;
+; WIN-LABEL: frem:
+; WIN: # %bb.0: # %entry
+; WIN-NEXT: subq $72, %rsp
+; WIN-NEXT: movaps (%rcx), %xmm0
+; WIN-NEXT: movaps (%rdx), %xmm1
+; WIN-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; WIN-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT: callq fmodl
+; WIN-NEXT: addq $72, %rsp
+; WIN-NEXT: retq
+;
+; WIN-X86-LABEL: frem:
+; WIN-X86: # %bb.0: # %entry
+; WIN-X86-NEXT: pushl %ebp
+; WIN-X86-NEXT: movl %esp, %ebp
+; WIN-X86-NEXT: pushl %edi
+; WIN-X86-NEXT: pushl %esi
+; WIN-X86-NEXT: andl $-16, %esp
+; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: movl 8(%ebp), %esi
+; WIN-X86-NEXT: movl %esp, %eax
+; WIN-X86-NEXT: pushl 40(%ebp)
+; WIN-X86-NEXT: pushl 36(%ebp)
+; WIN-X86-NEXT: pushl 32(%ebp)
+; WIN-X86-NEXT: pushl 28(%ebp)
+; WIN-X86-NEXT: pushl 24(%ebp)
+; WIN-X86-NEXT: pushl 20(%ebp)
+; WIN-X86-NEXT: pushl 16(%ebp)
+; WIN-X86-NEXT: pushl 12(%ebp)
+; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: calll _fmodl
+; WIN-X86-NEXT: addl $36, %esp
+; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT: movl %edi, 8(%esi)
+; WIN-X86-NEXT: movl %edx, 12(%esi)
+; WIN-X86-NEXT: movl %eax, (%esi)
+; WIN-X86-NEXT: movl %ecx, 4(%esi)
+; WIN-X86-NEXT: movl %esi, %eax
+; WIN-X86-NEXT: leal -8(%ebp), %esp
+; WIN-X86-NEXT: popl %esi
+; WIN-X86-NEXT: popl %edi
+; WIN-X86-NEXT: popl %ebp
+; WIN-X86-NEXT: retl
entry:
%div = call fp128 @llvm.experimental.constrained.frem.f128(fp128 %x, fp128 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
ret fp128 %div
@@ -342,6 +649,48 @@ define fp128 @ceil(fp128 %x) nounwind strictfp {
; X86-NEXT: addl $24, %esp
; X86-NEXT: popl %esi
; X86-NEXT: retl $4
+;
+; WIN-LABEL: ceil:
+; WIN: # %bb.0: # %entry
+; WIN-NEXT: subq $56, %rsp
+; WIN-NEXT: movaps (%rcx), %xmm0
+; WIN-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT: callq ceill
+; WIN-NEXT: addq $56, %rsp
+; WIN-NEXT: retq
+;
+; WIN-X86-LABEL: ceil:
+; WIN-X86: # %bb.0: # %entry
+; WIN-X86-NEXT: pushl %ebp
+; WIN-X86-NEXT: movl %esp, %ebp
+; WIN-X86-NEXT: pushl %edi
+; WIN-X86-NEXT: pushl %esi
+; WIN-X86-NEXT: andl $-16, %esp
+; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: movl 8(%ebp), %esi
+; WIN-X86-NEXT: movl %esp, %eax
+; WIN-X86-NEXT: pushl 24(%ebp)
+; WIN-X86-NEXT: pushl 20(%ebp)
+; WIN-X86-NEXT: pushl 16(%ebp)
+; WIN-X86-NEXT: pushl 12(%ebp)
+; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: calll _ceill
+; WIN-X86-NEXT: addl $20, %esp
+; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT: movl %edi, 8(%esi)
+; WIN-X86-NEXT: movl %edx, 12(%esi)
+; WIN-X86-NEXT: movl %eax, (%esi)
+; WIN-X86-NEXT: movl %ecx, 4(%esi)
+; WIN-X86-NEXT: movl %esi, %eax
+; WIN-X86-NEXT: leal -8(%ebp), %esp
+; WIN-X86-NEXT: popl %esi
+; WIN-X86-NEXT: popl %edi
+; WIN-X86-NEXT: popl %ebp
+; WIN-X86-NEXT: retl
entry:
%ceil = call fp128 @llvm.experimental.constrained.ceil.f128(fp128 %x, metadata !"fpexcept.strict") #0
ret fp128 %ceil
@@ -382,6 +731,48 @@ define fp128 @acos(fp128 %x) nounwind strictfp {
; X86-NEXT: addl $24, %esp
; X86-NEXT: popl %esi
; X86-NEXT: retl $4
+;
+; WIN-LABEL: acos:
+; WIN: # %bb.0: # %entry
+; WIN-NEXT: subq $56, %rsp
+; WIN-NEXT: movaps (%rcx), %xmm0
+; WIN-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT: callq acosl
+; WIN-NEXT: addq $56, %rsp
+; WIN-NEXT: retq
+;
+; WIN-X86-LABEL: acos:
+; WIN-X86: # %bb.0: # %entry
+; WIN-X86-NEXT: pushl %ebp
+; WIN-X86-NEXT: movl %esp, %ebp
+; WIN-X86-NEXT: pushl %edi
+; WIN-X86-NEXT: pushl %esi
+; WIN-X86-NEXT: andl $-16, %esp
+; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: movl 8(%ebp), %esi
+; WIN-X86-NEXT: movl %esp, %eax
+; WIN-X86-NEXT: pushl 24(%ebp)
+; WIN-X86-NEXT: pushl 20(%ebp)
+; WIN-X86-NEXT: pushl 16(%ebp)
+; WIN-X86-NEXT: pushl 12(%ebp)
+; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: calll _acosl
+; WIN-X86-NEXT: addl $20, %esp
+; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT: movl %edi, 8(%esi)
+; WIN-X86-NEXT: movl %edx, 12(%esi)
+; WIN-X86-NEXT: movl %eax, (%esi)
+; WIN-X86-NEXT: movl %ecx, 4(%esi)
+; WIN-X86-NEXT: movl %esi, %eax
+; WIN-X86-NEXT: leal -8(%ebp), %esp
+; WIN-X86-NEXT: popl %esi
+; WIN-X86-NEXT: popl %edi
+; WIN-X86-NEXT: popl %ebp
+; WIN-X86-NEXT: retl
entry:
%acos = call fp128 @llvm.experimental.constrained.acos.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
ret fp128 %acos
@@ -422,6 +813,48 @@ define fp128 @cos(fp128 %x) nounwind strictfp {
; X86-NEXT: addl $24, %esp
; X86-NEXT: popl %esi
; X86-NEXT: retl $4
+;
+; WIN-LABEL: cos:
+; WIN: # %bb.0: # %entry
+; WIN-NEXT: subq $56, %rsp
+; WIN-NEXT: movaps (%rcx), %xmm0
+; WIN-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT: callq cosl
+; WIN-NEXT: addq $56, %rsp
+; WIN-NEXT: retq
+;
+; WIN-X86-LABEL: cos:
+; WIN-X86: # %bb.0: # %entry
+; WIN-X86-NEXT: pushl %ebp
+; WIN-X86-NEXT: movl %esp, %ebp
+; WIN-X86-NEXT: pushl %edi
+; WIN-X86-NEXT: pushl %esi
+; WIN-X86-NEXT: andl $-16, %esp
+; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: movl 8(%ebp), %esi
+; WIN-X86-NEXT: movl %esp, %eax
+; WIN-X86-NEXT: pushl 24(%ebp)
+; WIN-X86-NEXT: pushl 20(%ebp)
+; WIN-X86-NEXT: pushl 16(%ebp)
+; WIN-X86-NEXT: pushl 12(%ebp)
+; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: calll _cosl
+; WIN-X86-NEXT: addl $20, %esp
+; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT: movl %edi, 8(%esi)
+; WIN-X86-NEXT: movl %edx, 12(%esi)
+; WIN-X86-NEXT: movl %eax, (%esi)
+; WIN-X86-NEXT: movl %ecx, 4(%esi)
+; WIN-X86-NEXT: movl %esi, %eax
+; WIN-X86-NEXT: leal -8(%ebp), %esp
+; WIN-X86-NEXT: popl %esi
+; WIN-X86-NEXT: popl %edi
+; WIN-X86-NEXT: popl %ebp
+; WIN-X86-NEXT: retl
entry:
%cos = call fp128 @llvm.experimental.constrained.cos.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
ret fp128 %cos
@@ -462,6 +895,48 @@ define fp128 @cosh(fp128 %x) nounwind strictfp {
; X86-NEXT: addl $24, %esp
; X86-NEXT: popl %esi
; X86-NEXT: retl $4
+;
+; WIN-LABEL: cosh:
+; WIN: # %bb.0: # %entry
+; WIN-NEXT: subq $56, %rsp
+; WIN-NEXT: movaps (%rcx), %xmm0
+; WIN-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT: callq coshl
+; WIN-NEXT: addq $56, %rsp
+; WIN-NEXT: retq
+;
+; WIN-X86-LABEL: cosh:
+; WIN-X86: # %bb.0: # %entry
+; WIN-X86-NEXT: pushl %ebp
+; WIN-X86-NEXT: movl %esp, %ebp
+; WIN-X86-NEXT: pushl %edi
+; WIN-X86-NEXT: pushl %esi
+; WIN-X86-NEXT: andl $-16, %esp
+; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: movl 8(%ebp), %esi
+; WIN-X86-NEXT: movl %esp, %eax
+; WIN-X86-NEXT: pushl 24(%ebp)
+; WIN-X86-NEXT: pushl 20(%ebp)
+; WIN-X86-NEXT: pushl 16(%ebp)
+; WIN-X86-NEXT: pushl 12(%ebp)
+; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: calll _coshl
+; WIN-X86-NEXT: addl $20, %esp
+; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT: movl %edi, 8(%esi)
+; WIN-X86-NEXT: movl %edx, 12(%esi)
+; WIN-X86-NEXT: movl %eax, (%esi)
+; WIN-X86-NEXT: movl %ecx, 4(%esi)
+; WIN-X86-NEXT: movl %esi, %eax
+; WIN-X86-NEXT: leal -8(%ebp), %esp
+; WIN-X86-NEXT: popl %esi
+; WIN-X86-NEXT: popl %edi
+; WIN-X86-NEXT: popl %ebp
+; WIN-X86-NEXT: retl
entry:
%cosh = call fp128 @llvm.experim...
[truncated]
Force-pushed from f6d8e8b to eea0a28
@rnk in #115052 (comment) you mentioned that changes in
I went looking at
Force-pushed from f57b8a2 to 71e42dc
Force-pushed from 71e42dc to b6c7942
…, r=<try> [do not merge] Windows f128 abi experiment Running tests with llvm/llvm-project#115052 and llvm/llvm-project#128848. r? `@ghost` try-job: dist-x86_64-msvc try-job: dist-x86_64-mingw try-job: x86_64-msvc-1 try-job: x86_64-msvc-2 try-job: x86_64-mingw-1 try-job: x86_64-mingw-2 try-job: x86_64-mingw-3
…, r=<try> [do not merge] Windows f128 abi experiment Running tests with llvm/llvm-project#115052 and llvm/llvm-project#128848. r? `@ghost` try-job: dist-x86_64-msvc try-job: dist-x86_64-mingw try-job: x86_64-msvc-1 try-job: x86_64-msvc-2 try-job: x86_64-mingw-1 try-job: x86_64-mingw-2
Thanks for looking into that. I think this should be all set then, along with #115052.
Thanks for reviewing! I need somebody to land this for me. The first commit for both PRs is NFC.
…s (#115052) Clang currently passes and returns `__float128` in vector registers on MinGW targets, which is LLVM's default ABI for `fp128`. However, the Windows x86-64 calling convention [1] states the following:

__m128 types, arrays, and strings are never passed by immediate value. Instead, a pointer is passed to memory allocated by the caller. Structs and unions of size 8, 16, 32, or 64 bits, and __m64 types, are passed as if they were integers of the same size. Structs or unions of other sizes are passed as a pointer to memory allocated by the caller. For these aggregate types passed as a pointer, including __m128, the caller-allocated temporary memory must be 16-byte aligned.

Based on the above, it sounds like `__float128` should be passed indirectly. Thus, change `f128` passing to use the stack and make the return in xmm0 explicit. This is identical to `i128`, and passing is the same as GCC. Regarding return values, the documentation states:

A scalar return value that can fit into 64 bits, including the __m64 type, is returned through RAX. Non-scalar types including floats, doubles, and vector types such as __m128, __m128i, __m128d are returned in XMM0.

This makes it sound like it should be acceptable to return `__float128` in xmm0; however, GCC returns `__float128` on the stack. That ABI statement, as well as consistency with `i128` (which is returned in xmm0), means it would likely be better for GCC to change its return ABI to match Clang rather than the other way around, so that portion is left as-is.

Clang's MSVC targets do not support `__float128` or `_Float128`, but these changes would also apply there if it is eventually enabled. With [2], which should land around the same time, LLVM will also implement this ABI, so it is not technically necessary for Clang to make a change here as well. This is still done in order to be consistent with other types, and to allow calling convention-aware optimizations at all available optimization layers (@rnk mentioned possible reuse of stack arguments). An added benefit is readability of the LLVM IR, since it more accurately reflects what the lowered assembly does.

[1]: https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170
[2]: llvm/llvm-project#128848
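As a rough sketch of the IR shape this frontend change produces (my illustration, not output from the patch; the exact parameter attributes Clang attaches are elided and may differ), a two-argument `__float128` addition on a MinGW target would now come out of Clang roughly as:

; C source: __float128 add(__float128 x, __float128 y) { return x + y; }
define fp128 @add(ptr %x, ptr %y) {
  ; Each argument arrives as a pointer to caller-allocated, 16-byte-aligned
  ; memory; the fp128 return value stays direct and lowers to xmm0.
  %xv = load fp128, ptr %x, align 16
  %yv = load fp128, ptr %y, align 16
  %r = fadd fp128 %xv, %yv
  ret fp128 %r
}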
@phoebewang would you mind getting this one too? The first commit is NFC and should land separately.
I see some problems here:
Add Windows invocations to existing fp128 libcall tests and create a new ABI test for x86.
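For a sense of what such an ABI test checks, a minimal case might look like the following; this is a hypothetical sketch in the same style as the tests above, not the actual test added here:

; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc | FileCheck %s
define fp128 @passthrough(fp128 %x) {
; CHECK-LABEL: passthrough:
; CHECK: movaps (%rcx), %xmm0
; CHECK: retq
  ret fp128 %x
}

The argument arrives as a pointer in rcx and the value is returned directly in xmm0: the pass-indirectly/return-in-xmm0 split this PR implements.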
Force-pushed from b6c7942 to 5187cfd
LLVM currently expects `__float128` to be both passed and returned in xmm registers on Windows. However, this disagrees with the Windows x86-64 calling convention [1], which indicates values larger than 64 bits should be passed indirectly.

Update LLVM's default Windows calling convention to pass `fp128` indirectly. Returning in xmm0 is unchanged since this seems like a reasonable extrapolation of the ABI. With this patch, the calling convention for `i128` and `f128` is the same.

GCC passes `__float128` indirectly, which this also matches. However, it also returns indirectly, which is not done here. I intend to attempt a GCC change to also return in `xmm0` rather than making that change here, given the consistency with `i128`.

This corresponds to the frontend change in [2]; see more details there.

[1]: https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170
[2]: llvm#115052
Force-pushed from 5187cfd to e0a3738
This is Windows-only. Were you referring to "Update LLVM's default calling convention" in the description? I meant that to refer to the default Windows CC used for libcalls; I've updated the wording to be clearer. The SysV ABI actually specifies __float128 as SSE+SSEUP, and LLVM already does the right thing there.
That test only runs on Linux; should I add the Windows targets? Some builtin calls are covered in
Done. Also rebased so this is on top of #115052.
No, I had just read the patch as affecting both Linux and Windows. It looks good to me if it's Windows-only.
@tgross35 Congratulations on having your first Pull Request (PR) merged into the LLVM Project!

Your changes will be combined with recent changes from other authors, then tested by our build bots. If there is a problem with a build, you may receive a report in an email or a comment on this PR. Please check whether problems have been caused by your change specifically, as the builds can include changes from many authors. It is not uncommon for your change to be included in a build that fails due to someone else's changes, or infrastructure issues. How to do this, and the rest of the post-merge process, is covered in detail here.

If your change does cause a problem, it may be reverted, or you can revert it yourself. This is a normal part of LLVM development. You can fix your changes and open a new PR to merge them again.

If you don't get any reports, no action is required from you. Your changes are working as expected, well done!