[windows] Always pass fp128 arguments indirectly #128848


Merged
merged 2 commits into llvm:main from windows-f128-libcall-abi on Mar 6, 2025

Conversation

tgross35
Contributor

@tgross35 tgross35 commented Feb 26, 2025

LLVM currently expects __float128 to be both passed and returned in
xmm registers on Windows. However, this disagrees with the Windows
x86-64 calling convention [1], which indicates values larger than 64
bits should be passed indirectly.

Update LLVM's default Windows calling convention to pass fp128
indirectly. Returning in xmm0 is unchanged since this seems like a
reasonable extrapolation of the ABI. With this patch, the calling
convention for i128 and f128 is the same.

GCC passes __float128 indirectly, which this also matches. However, it
also returns indirectly, which is not done here. I intend to attempt a
GCC change to also return in xmm0 rather than making that change here,
given the consistency with i128.

This corresponds to the frontend change in [2]; see more details there.

[1]: https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170
[2]: https://github.com/llvm/llvm-project/pull/115052
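
For illustration, a minimal sketch of a function affected by this change; it mirrors the `add` function from the updated tests in the diff below, and the triple comes from the new RUN lines:

    ; Sketch mirroring the updated fp128 libcall tests. Built with
    ; llc -mtriple=x86_64-pc-windows-msvc, both fp128 arguments now
    ; arrive indirectly (pointers in %rcx and %rdx); the callee loads
    ; them, re-spills 16-byte-aligned copies for the __addtf3 libcall,
    ; and the fp128 result is still returned in %xmm0.
    define fp128 @add(fp128 %x, fp128 %y) nounwind {
    entry:
      %sum = fadd fp128 %x, %y
      ret fp128 %sum
    }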


Thank you for submitting a Pull Request (PR) to the LLVM Project!

This PR will be automatically labeled and the relevant teams will be notified.

If you wish to, you can add reviewers by using the "Reviewers" section on this page.

If this is not working for you, it is probably because you do not have write permissions for the repository, in which case you can instead tag reviewers by name in a comment by using @ followed by their GitHub username.

If you have received no comments on your PR for a week, you can request a review by "ping"ing the PR by adding a comment “Ping”. The common courtesy "ping" rate is once a week. Please remember that you are asking for valuable time from other developers.

If you have further questions, they may be answered by the LLVM GitHub User Guide.

You can also ask questions in a comment on this PR, on the LLVM Discord or on the forums.

@llvmbot
Member

llvmbot commented Feb 26, 2025

@llvm/pr-subscribers-backend-x86

Author: Trevor Gross (tgross35)

Changes

LLVM expects __float128 to be both passed and returned in xmm
registers on Windows. However, this disagrees with the Windows x86-64
calling convention [1], which indicates values larger than 64 bits
should be passed indirectly.

Update LLVM's libcall calling convention to pass fp128 indirectly.
Returning in xmm0 is unchanged since this seems like a reasonable
extrapolation of the ABI.

With this change, the calling convention for i128 and f128 becomes
the same.

This corresponds to the frontend change in [2].


Patch is 121.54 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/128848.diff

3 Files Affected:

  • (modified) llvm/lib/Target/X86/X86CallingConv.td (+2-1)
  • (modified) llvm/test/CodeGen/X86/fp128-libcalls-strict.ll (+1814)
  • (modified) llvm/test/CodeGen/X86/fp128-libcalls.ll (+1214)
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index cf164acba9ec0..0d087e057a2bd 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -639,8 +639,9 @@ def CC_X86_Win64_C : CallingConv<[
   // 512 bit vectors are passed by pointer
   CCIfType<[v64i8, v32i16, v16i32, v32f16, v32bf16, v16f32, v8f64, v8i64], CCPassIndirect<i64>>,
 
-  // Long doubles are passed by pointer
+  // Float types larger than 64-bits (long double and fp128) are passed by pointer
   CCIfType<[f80], CCPassIndirect<i64>>,
+  CCIfType<[f128], CCPassIndirect<i64>>,
 
   // If SSE was disabled, pass FP values smaller than 64-bits as integers in
   // GPRs or on the stack.
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
index 5263e0d4f6f39..a85b53ea62ac7 100644
--- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
@@ -8,6 +8,12 @@
 ; RUN: llc < %s -O2 -mtriple=i686-linux-gnu -mattr=+sse2 \
 ; RUN:     -enable-legalize-types-checking \
 ; RUN:     | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -O2 -mtriple=x86_64-pc-windows-msvc \
+; RUN:     -enable-legalize-types-checking \
+; RUN:     | FileCheck %s --check-prefix=WIN
+; RUN: llc < %s -O2 -mtriple=i686-pc-windows-msvc \
+; RUN:     -enable-legalize-types-checking \
+; RUN:     | FileCheck %s --check-prefix=WIN-X86
 
 ; Check all soft floating point library function calls.
 
@@ -57,6 +63,55 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp {
 ; X86-NEXT:    addl $24, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
+;
+; WIN-LABEL: add:
+; WIN:       # %bb.0: # %entry
+; WIN-NEXT:    subq $72, %rsp
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    movaps (%rdx), %xmm1
+; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    callq __addtf3
+; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    retq
+;
+; WIN-X86-LABEL: add:
+; WIN-X86:       # %bb.0: # %entry
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esp, %eax
+; WIN-X86-NEXT:    pushl 40(%ebp)
+; WIN-X86-NEXT:    pushl 36(%ebp)
+; WIN-X86-NEXT:    pushl 32(%ebp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    calll ___addtf3
+; WIN-X86-NEXT:    addl $36, %esp
+; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT:    movl %edi, 8(%esi)
+; WIN-X86-NEXT:    movl %edx, 12(%esi)
+; WIN-X86-NEXT:    movl %eax, (%esi)
+; WIN-X86-NEXT:    movl %ecx, 4(%esi)
+; WIN-X86-NEXT:    movl %esi, %eax
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebp
+; WIN-X86-NEXT:    retl
 entry:
   %add = call fp128 @llvm.experimental.constrained.fadd.f128(fp128 %x, fp128 %y,  metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret fp128 %add
@@ -108,6 +163,55 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp {
 ; X86-NEXT:    addl $24, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
+;
+; WIN-LABEL: sub:
+; WIN:       # %bb.0: # %entry
+; WIN-NEXT:    subq $72, %rsp
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    movaps (%rdx), %xmm1
+; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    callq __subtf3
+; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    retq
+;
+; WIN-X86-LABEL: sub:
+; WIN-X86:       # %bb.0: # %entry
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esp, %eax
+; WIN-X86-NEXT:    pushl 40(%ebp)
+; WIN-X86-NEXT:    pushl 36(%ebp)
+; WIN-X86-NEXT:    pushl 32(%ebp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    calll ___subtf3
+; WIN-X86-NEXT:    addl $36, %esp
+; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT:    movl %edi, 8(%esi)
+; WIN-X86-NEXT:    movl %edx, 12(%esi)
+; WIN-X86-NEXT:    movl %eax, (%esi)
+; WIN-X86-NEXT:    movl %ecx, 4(%esi)
+; WIN-X86-NEXT:    movl %esi, %eax
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebp
+; WIN-X86-NEXT:    retl
 entry:
   %sub = call fp128 @llvm.experimental.constrained.fsub.f128(fp128 %x, fp128 %y,  metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret fp128 %sub
@@ -159,6 +263,55 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp {
 ; X86-NEXT:    addl $24, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
+;
+; WIN-LABEL: mul:
+; WIN:       # %bb.0: # %entry
+; WIN-NEXT:    subq $72, %rsp
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    movaps (%rdx), %xmm1
+; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    callq __multf3
+; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    retq
+;
+; WIN-X86-LABEL: mul:
+; WIN-X86:       # %bb.0: # %entry
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esp, %eax
+; WIN-X86-NEXT:    pushl 40(%ebp)
+; WIN-X86-NEXT:    pushl 36(%ebp)
+; WIN-X86-NEXT:    pushl 32(%ebp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    calll ___multf3
+; WIN-X86-NEXT:    addl $36, %esp
+; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT:    movl %edi, 8(%esi)
+; WIN-X86-NEXT:    movl %edx, 12(%esi)
+; WIN-X86-NEXT:    movl %eax, (%esi)
+; WIN-X86-NEXT:    movl %ecx, 4(%esi)
+; WIN-X86-NEXT:    movl %esi, %eax
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebp
+; WIN-X86-NEXT:    retl
 entry:
   %mul = call fp128 @llvm.experimental.constrained.fmul.f128(fp128 %x, fp128 %y,  metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret fp128 %mul
@@ -210,6 +363,55 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp {
 ; X86-NEXT:    addl $24, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
+;
+; WIN-LABEL: div:
+; WIN:       # %bb.0: # %entry
+; WIN-NEXT:    subq $72, %rsp
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    movaps (%rdx), %xmm1
+; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    callq __divtf3
+; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    retq
+;
+; WIN-X86-LABEL: div:
+; WIN-X86:       # %bb.0: # %entry
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esp, %eax
+; WIN-X86-NEXT:    pushl 40(%ebp)
+; WIN-X86-NEXT:    pushl 36(%ebp)
+; WIN-X86-NEXT:    pushl 32(%ebp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    calll ___divtf3
+; WIN-X86-NEXT:    addl $36, %esp
+; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT:    movl %edi, 8(%esi)
+; WIN-X86-NEXT:    movl %edx, 12(%esi)
+; WIN-X86-NEXT:    movl %eax, (%esi)
+; WIN-X86-NEXT:    movl %ecx, 4(%esi)
+; WIN-X86-NEXT:    movl %esi, %eax
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebp
+; WIN-X86-NEXT:    retl
 entry:
   %div = call fp128 @llvm.experimental.constrained.fdiv.f128(fp128 %x, fp128 %y,  metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret fp128 %div
@@ -258,6 +460,62 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp {
 ; X86-NEXT:    addl $24, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
+;
+; WIN-LABEL: fma:
+; WIN:       # %bb.0: # %entry
+; WIN-NEXT:    subq $88, %rsp
+; WIN-NEXT:    movaps (%r8), %xmm0
+; WIN-NEXT:    movaps (%rcx), %xmm1
+; WIN-NEXT:    movaps (%rdx), %xmm2
+; WIN-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
+; WIN-NEXT:    callq fmal
+; WIN-NEXT:    addq $88, %rsp
+; WIN-NEXT:    retq
+;
+; WIN-X86-LABEL: fma:
+; WIN-X86:       # %bb.0: # %entry
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esp, %eax
+; WIN-X86-NEXT:    pushl 56(%ebp)
+; WIN-X86-NEXT:    pushl 52(%ebp)
+; WIN-X86-NEXT:    pushl 48(%ebp)
+; WIN-X86-NEXT:    pushl 44(%ebp)
+; WIN-X86-NEXT:    pushl 40(%ebp)
+; WIN-X86-NEXT:    pushl 36(%ebp)
+; WIN-X86-NEXT:    pushl 32(%ebp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    calll _fmal
+; WIN-X86-NEXT:    addl $52, %esp
+; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT:    movl %edi, 8(%esi)
+; WIN-X86-NEXT:    movl %edx, 12(%esi)
+; WIN-X86-NEXT:    movl %eax, (%esi)
+; WIN-X86-NEXT:    movl %ecx, 4(%esi)
+; WIN-X86-NEXT:    movl %esi, %eax
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebp
+; WIN-X86-NEXT:    retl
 entry:
   %fma = call fp128 @llvm.experimental.constrained.fma.f128(fp128 %x, fp128 %y,  fp128 %z, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret fp128 %fma
@@ -302,6 +560,55 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp {
 ; X86-NEXT:    addl $24, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
+;
+; WIN-LABEL: frem:
+; WIN:       # %bb.0: # %entry
+; WIN-NEXT:    subq $72, %rsp
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    movaps (%rdx), %xmm1
+; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    callq fmodl
+; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    retq
+;
+; WIN-X86-LABEL: frem:
+; WIN-X86:       # %bb.0: # %entry
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esp, %eax
+; WIN-X86-NEXT:    pushl 40(%ebp)
+; WIN-X86-NEXT:    pushl 36(%ebp)
+; WIN-X86-NEXT:    pushl 32(%ebp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    calll _fmodl
+; WIN-X86-NEXT:    addl $36, %esp
+; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT:    movl %edi, 8(%esi)
+; WIN-X86-NEXT:    movl %edx, 12(%esi)
+; WIN-X86-NEXT:    movl %eax, (%esi)
+; WIN-X86-NEXT:    movl %ecx, 4(%esi)
+; WIN-X86-NEXT:    movl %esi, %eax
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebp
+; WIN-X86-NEXT:    retl
 entry:
   %div = call fp128 @llvm.experimental.constrained.frem.f128(fp128 %x, fp128 %y,  metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret fp128 %div
@@ -342,6 +649,48 @@ define fp128 @ceil(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    addl $24, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
+;
+; WIN-LABEL: ceil:
+; WIN:       # %bb.0: # %entry
+; WIN-NEXT:    subq $56, %rsp
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    callq ceill
+; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    retq
+;
+; WIN-X86-LABEL: ceil:
+; WIN-X86:       # %bb.0: # %entry
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esp, %eax
+; WIN-X86-NEXT:    pushl 24(%ebp)
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    calll _ceill
+; WIN-X86-NEXT:    addl $20, %esp
+; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT:    movl %edi, 8(%esi)
+; WIN-X86-NEXT:    movl %edx, 12(%esi)
+; WIN-X86-NEXT:    movl %eax, (%esi)
+; WIN-X86-NEXT:    movl %ecx, 4(%esi)
+; WIN-X86-NEXT:    movl %esi, %eax
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebp
+; WIN-X86-NEXT:    retl
 entry:
   %ceil = call fp128 @llvm.experimental.constrained.ceil.f128(fp128 %x, metadata !"fpexcept.strict") #0
   ret fp128 %ceil
@@ -382,6 +731,48 @@ define fp128 @acos(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    addl $24, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
+;
+; WIN-LABEL: acos:
+; WIN:       # %bb.0: # %entry
+; WIN-NEXT:    subq $56, %rsp
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    callq acosl
+; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    retq
+;
+; WIN-X86-LABEL: acos:
+; WIN-X86:       # %bb.0: # %entry
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esp, %eax
+; WIN-X86-NEXT:    pushl 24(%ebp)
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    calll _acosl
+; WIN-X86-NEXT:    addl $20, %esp
+; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT:    movl %edi, 8(%esi)
+; WIN-X86-NEXT:    movl %edx, 12(%esi)
+; WIN-X86-NEXT:    movl %eax, (%esi)
+; WIN-X86-NEXT:    movl %ecx, 4(%esi)
+; WIN-X86-NEXT:    movl %esi, %eax
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebp
+; WIN-X86-NEXT:    retl
 entry:
   %acos = call fp128 @llvm.experimental.constrained.acos.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret fp128 %acos
@@ -422,6 +813,48 @@ define fp128 @cos(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    addl $24, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
+;
+; WIN-LABEL: cos:
+; WIN:       # %bb.0: # %entry
+; WIN-NEXT:    subq $56, %rsp
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    callq cosl
+; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    retq
+;
+; WIN-X86-LABEL: cos:
+; WIN-X86:       # %bb.0: # %entry
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esp, %eax
+; WIN-X86-NEXT:    pushl 24(%ebp)
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    calll _cosl
+; WIN-X86-NEXT:    addl $20, %esp
+; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT:    movl %edi, 8(%esi)
+; WIN-X86-NEXT:    movl %edx, 12(%esi)
+; WIN-X86-NEXT:    movl %eax, (%esi)
+; WIN-X86-NEXT:    movl %ecx, 4(%esi)
+; WIN-X86-NEXT:    movl %esi, %eax
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebp
+; WIN-X86-NEXT:    retl
 entry:
   %cos = call fp128 @llvm.experimental.constrained.cos.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret fp128 %cos
@@ -462,6 +895,48 @@ define fp128 @cosh(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    addl $24, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
+;
+; WIN-LABEL: cosh:
+; WIN:       # %bb.0: # %entry
+; WIN-NEXT:    subq $56, %rsp
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    callq coshl
+; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    retq
+;
+; WIN-X86-LABEL: cosh:
+; WIN-X86:       # %bb.0: # %entry
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esp, %eax
+; WIN-X86-NEXT:    pushl 24(%ebp)
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    calll _coshl
+; WIN-X86-NEXT:    addl $20, %esp
+; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT:    movl %edi, 8(%esi)
+; WIN-X86-NEXT:    movl %edx, 12(%esi)
+; WIN-X86-NEXT:    movl %eax, (%esi)
+; WIN-X86-NEXT:    movl %ecx, 4(%esi)
+; WIN-X86-NEXT:    movl %esi, %eax
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebp
+; WIN-X86-NEXT:    retl
 entry:
   %cosh = call fp128 @llvm.experim...
[truncated]

@tgross35 tgross35 marked this pull request as draft February 26, 2025 09:47
@tgross35 tgross35 force-pushed the windows-f128-libcall-abi branch from f6d8e8b to eea0a28 on February 26, 2025 20:35
@tgross35
Contributor Author

@rnk in #115052 (comment) you mentioned that changes in X86ISelLowering may be needed - any idea what these would look like? The test changes here seem fine but I'm also not sure what failure modes I should be looking for.

@rnk
Collaborator

rnk commented Feb 27, 2025

I went looking at X86TargetLowering::LowerWin64_i128OP. I think that logic exists to manually implement the calling convention logic that you've implemented here, and perhaps we just don't need this logic anymore. I can't think of any specific concerns.

@tgross35 tgross35 force-pushed the windows-f128-libcall-abi branch 2 times, most recently from f57b8a2 to 71e42dc on February 28, 2025 05:17
@tgross35 tgross35 marked this pull request as ready for review February 28, 2025 05:20
@tgross35 tgross35 force-pushed the windows-f128-libcall-abi branch from 71e42dc to b6c7942 on February 28, 2025 08:52
bors added a commit to rust-lang-ci/rust that referenced this pull request Feb 28, 2025
…, r=<try>

[do not merge] Windows f128 abi experiment

Running tests with llvm/llvm-project#115052 and llvm/llvm-project#128848.

r? `@ghost`

try-job: dist-x86_64-msvc
try-job: dist-x86_64-mingw
try-job: x86_64-msvc-1
try-job: x86_64-msvc-2
try-job: x86_64-mingw-1
try-job: x86_64-mingw-2
try-job: x86_64-mingw-3
@tgross35 tgross35 changed the title from "[windows] Always pass libcall fp128 arguments indirectly" to "[windows] Always pass fp128 arguments indirectly" on Feb 28, 2025
bors added a commit to rust-lang-ci/rust that referenced this pull request Feb 28, 2025
…, r=<try>

[do not merge] Windows f128 abi experiment

Running tests with llvm/llvm-project#115052 and llvm/llvm-project#128848.

r? `@ghost`

try-job: dist-x86_64-msvc
try-job: dist-x86_64-mingw
try-job: x86_64-msvc-1
try-job: x86_64-msvc-2
try-job: x86_64-mingw-1
try-job: x86_64-mingw-2
@tgross35
Contributor Author

tgross35 commented Feb 28, 2025

Thanks for looking into that. I think this should be all set then, along with #115052

bors added a commit to rust-lang-ci/rust that referenced this pull request Feb 28, 2025
(Same try-build commit message as above.)
@tgross35
Contributor Author

tgross35 commented Mar 1, 2025

Thanks for reviewing, I need somebody to land this for me.

First commit for both PRs is NFC

tgross35 added a commit to tgross35/llvm-project that referenced this pull request Mar 1, 2025
Clang currently passes and returns `__float128` in vector registers on
MinGW targets, which is LLVM's default ABI for `fp128`. However, the
Windows x86-64 calling convention [1] states the following:

    __m128 types, arrays, and strings are never passed by immediate
    value. Instead, a pointer is passed to memory allocated by the
    caller. Structs and unions of size 8, 16, 32, or 64 bits, and __m64
    types, are passed as if they were integers of the same size. Structs
    or unions of other sizes are passed as a pointer to memory allocated
    by the caller. For these aggregate types passed as a pointer,
    including __m128, the caller-allocated temporary memory must be
    16-byte aligned.

Based on the above it sounds like `__float128` should be passed
indirectly. Thus, change `f128` passing to use the stack and make the
return in xmm0 explicit. This is identical to `i128`, and passing is
the same as GCC.

Regarding return values, the documentation states:

    A scalar return value that can fit into 64 bits, including the __m64
    type, is returned through RAX. Non-scalar types including floats,
    doubles, and vector types such as __m128, __m128i, __m128d are
    returned in XMM0.

This makes it sound like it should be acceptable to return `__float128`
in xmm0; however, GCC returns `__float128` on the stack. That above ABI
statement as well as consistency with `i128` (which is returned in xmm0)
mean that it would likely be better for GCC to change its return ABI to
match Clang rather than the other way around, so that portion is left
as-is.

Clang's MSVC targets do not support `__float128` or `_Float128`, but
these changes would also apply there if it is eventually enabled.

With [2] which should land around the same time, LLVM will also
implement this ABI so it is not technically necessary for Clang to make
a change here as well. This is still done in order to be consistent with
other types, and to allow calling convention-aware optimizations at all
available optimization layers (@rnk mentioned possible reuse of stack
arguments). An added benefit is readability of the LLVM IR, since it more
accurately reflects what the lowered assembly does.

[1]: https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170
[2]: llvm#128848
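
For illustration, a hedged sketch of the IR-level difference this commit message describes; the function names are hypothetical and parameter attributes are omitted:

    ; Hypothetical declarations for `__float128 square(__float128)` on
    ; x86_64-w64-windows-gnu (names and omitted attributes are illustrative).

    ; Before the change: the fp128 argument is passed directly, which the
    ; backend lowered to an xmm register.
    declare fp128 @square_old(fp128)

    ; After the change: the caller stores the value to 16-byte-aligned
    ; temporary memory and passes its address, as the Win64 ABI prescribes
    ; for types larger than 64 bits; the fp128 return still lowers to xmm0.
    declare fp128 @square_new(ptr)
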
tgross35 added a commit to tgross35/llvm-project that referenced this pull request Mar 2, 2025
(Same commit message as quoted above.)
phoebewang pushed a commit that referenced this pull request Mar 6, 2025
(Same commit message as quoted above.)
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Mar 6, 2025
…s (#115052)

(Same commit message as quoted above.)
@tgross35
Contributor Author

tgross35 commented Mar 6, 2025

@phoebewang would you mind getting this one too?

The first commit is NFC and should land separately

@phoebewang
Contributor

I see some problems here:

* The title says it's for Windows but the description says it's for both. So the title is incorrect?
* I'd expect more tests affected by this change, e.g., https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/X86/fp128-cast.ll#L1059 should store into memory and pass to __gttf2, right?
* ABI change needs to be documented in ReleaseNote

Add Windows invocations to existing fp128 libcall tests and create a new
ABI test for x86.
@tgross35 tgross35 force-pushed the windows-f128-libcall-abi branch from b6c7942 to 5187cfd on March 6, 2025 10:23
LLVM currently expects `__float128` to be both passed and returned in
xmm registers on Windows. However, this disagrees with the Windows
x86-64 calling convention [1], which indicates values larger than 64
bits should be passed indirectly.

Update LLVM's default Windows calling convention to pass `fp128`
indirectly. Returning in xmm0 is unchanged since this seems like a
reasonable extrapolation of the ABI. With this patch, the calling
convention for `i128` and `f128` is the same.

GCC passes `__float128` indirectly, which this also matches. However, it
also returns indirectly, which is not done here. I intend to attempt a
GCC change to also return in `xmm0` rather than making that change here,
given the consistency with `i128`.

This corresponds to the frontend change in [2], see more details there.

[1]: https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170
[2]: llvm#115052
@tgross35 tgross35 force-pushed the windows-f128-libcall-abi branch from 5187cfd to e0a3738 on March 6, 2025 10:27
@tgross35
Contributor Author

tgross35 commented Mar 6, 2025

I see some problems here:

* The title says it's for Windows but the description says it's for both. So the title is incorrect?

This is Windows-only. Were you referring to "Update LLVM's default calling convention" in the description? I meant this to refer to the default Windows CC used for libcalls; updated this to be clearer.

The SysV ABI actually specifies __float128 as SSE+SSEUP, and LLVM already does the right thing there.

* I'd expect more tests affected by this change, e.g., https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/X86/fp128-cast.ll#L1059 should store into memory and pass to __gttf2, right?

That test only runs on Linux; should I add the Windows targets? Some builtin calls are covered in llvm/test/CodeGen/X86/fp128-libcalls.ll, e.g. the call to __addtf3 here e0a3738#diff-ea9428e87a77d0898c35fe6d9ff99f0ed9ba709ec232da43bf9fe0915894ce3cL159-R176

* ABI change needs to be documented in ReleaseNote

Done. Also rebased so this is on top of #115052.

@phoebewang
Contributor

That test only runs on Linux; should I add the Windows targets? Some builtin calls are covered in llvm/test/CodeGen/X86/fp128-libcalls.ll, e.g. the call to __addtf3 here e0a3738#diff-ea9428e87a77d0898c35fe6d9ff99f0ed9ba709ec232da43bf9fe0915894ce3cL159-R176

No, I had just read the patch as affecting both Linux and Windows. It looks good to me if it's Windows-only.

@phoebewang phoebewang merged commit 5ee1c0b into llvm:main Mar 6, 2025
10 of 11 checks passed

github-actions bot commented Mar 6, 2025

@tgross35 Congratulations on having your first Pull Request (PR) merged into the LLVM Project!

Your changes will be combined with recent changes from other authors, then tested by our build bots. If there is a problem with a build, you may receive a report in an email or a comment on this PR.

Please check whether problems have been caused by your change specifically, as the builds can include changes from many authors. It is not uncommon for your change to be included in a build that fails due to someone else's changes, or infrastructure issues.

How to do this, and the rest of the post-merge process, is covered in detail here.

If your change does cause a problem, it may be reverted, or you can revert it yourself. This is a normal part of LLVM development. You can fix your changes and open a new PR to merge them again.

If you don't get any reports, no action is required from you. Your changes are working as expected, well done!

@tgross35 tgross35 deleted the windows-f128-libcall-abi branch March 6, 2025 20:31
jph-13 pushed a commit to jph-13/llvm-project that referenced this pull request Mar 21, 2025
(Same commit message as quoted above.)
jph-13 pushed a commit to jph-13/llvm-project that referenced this pull request Mar 21, 2025
(Same commit message as quoted above.)