Conversation

@folkertdev
Contributor

fixes #98389

As the issue describes, promoting llvm.fma.f16 to llvm.fma.f32 does not work, because there is not enough precision to handle the repeated rounding. f64 does have sufficient space. So this PR explicitly promotes the 16-bit fma to a 64-bit fma.

I could not find examples of a libcall being used for fma, but that's something that could be looked into separately to work around code size issues.
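
To make the precision argument concrete, here is a minimal numeric sketch (illustrative only, not taken from the issue; the inputs and the roundTo helper are constructed for this example, assuming round-to-nearest-even):

// Simulate fma(a, b, c) on f16 operands, comparing the promote-to-f32
// path against the promote-to-f64 path. All values here are exactly
// representable in double, so double arithmetic stands in for "exact".
#include <cmath>
#include <cstdio>

// Round x to a p-bit significand, round-to-nearest-even (the default FP
// mode). Ignores exponent-range effects, which is fine for these values.
static double roundTo(double x, int p) {
  int e;
  std::frexp(x, &e);                     // x = m * 2^e, m in [0.5, 1)
  double scale = std::ldexp(1.0, p - e); // integer part holds p bits
  return std::nearbyint(x * scale) / scale;
}

int main() {
  // f16-representable inputs: a = 1 + 2^-5, b = 1 + 2^-6,
  // c = 2^-24 (the smallest f16 subnormal).
  double a = 1.0 + std::ldexp(1.0, -5);
  double b = 1.0 + std::ldexp(1.0, -6);
  double c = std::ldexp(1.0, -24);

  // Exact result: 1 + 2^-5 + 2^-6 + 2^-11 + 2^-24 (25 significand bits).
  double exact = a * b + c;

  // Rounding to f32 (p=24) lands exactly on a tie, which then breaks the
  // wrong way when rounding to f16 (p=11). f64 (p=53) holds the result
  // exactly, so the rounding to f16 happens only once.
  std::printf("via f32: %.10f\n", roundTo(roundTo(exact, 24), 11)); // 1.0468750000
  std::printf("via f64: %.10f\n", roundTo(exact, 11));              // 1.0478515625
}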

@llvmbot
Member

llvmbot commented Dec 11, 2025

@llvm/pr-subscribers-backend-nvptx
@llvm/pr-subscribers-backend-aarch64
@llvm/pr-subscribers-backend-x86
@llvm/pr-subscribers-backend-risc-v
@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-backend-systemz

Author: Folkert de Vries (folkertdev)

Changes

fixes #98389

As the issue describes, promoting llvm.fma.f16 to llvm.fma.f32 does not work, because there is not enough precision to handle the repeated rounding. f64 does have sufficient space. So this PR explicitly promotes the 16-bit fma to a 64-bit fma.

I could not find examples of a libcall being used for fma, but that's something that could be looked into separately to work around code size issues.


Full diff: https://github.com/llvm/llvm-project/pull/171904.diff

5 Files Affected:

  • (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (+36-5)
  • (modified) llvm/test/CodeGen/ARM/fp16-promote.ll (+50-30)
  • (modified) llvm/test/CodeGen/SystemZ/fp-mul-06.ll (+17-6)
  • (modified) llvm/test/CodeGen/SystemZ/fp-mul-08.ll (+6-6)
  • (modified) llvm/test/CodeGen/SystemZ/fp-mul-10.ll (+10-10)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index dcf2df305d24a..4c74cc9ebe061 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6919,12 +6919,43 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
                              getValue(I.getArgOperand(0)), Flags));
     return;
   }
-  case Intrinsic::fma:
-    setValue(&I, DAG.getNode(
-                     ISD::FMA, sdl, getValue(I.getArgOperand(0)).getValueType(),
-                     getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)),
-                     getValue(I.getArgOperand(2)), Flags));
+  case Intrinsic::fma: {
+    SDValue A = getValue(I.getArgOperand(0));
+    SDValue B = getValue(I.getArgOperand(1));
+    SDValue C = getValue(I.getArgOperand(2));
+
+    Type *Ty = I.getType();
+    EVT VT = TLI.getValueType(DAG.getDataLayout(), Ty);
+    if (Ty->isHalfTy() && !TLI.isOperationLegalOrCustom(ISD::FMA, VT)) {
+      // An f16 fma must go via f64 to prevent double rounding issues.
+
+      EVT HalfVT = VT;
+      EVT DoubleVT = MVT::f64;
+
+      SDValue A64 = DAG.getNode(ISD::FP_EXTEND, sdl, DoubleVT, A);
+      SDValue B64 = DAG.getNode(ISD::FP_EXTEND, sdl, DoubleVT, B);
+      SDValue C64 = DAG.getNode(ISD::FP_EXTEND, sdl, DoubleVT, C);
+
+      // Prefer FMA in double if the target has it (optimizes better).
+      SDValue Fma64;
+      if (TLI.isOperationLegalOrCustom(ISD::FMA, DoubleVT)) {
+        Fma64 = DAG.getNode(ISD::FMA, sdl, DoubleVT, A64, B64, C64, Flags);
+      } else {
+        SDValue Mul = DAG.getNode(ISD::FMUL, sdl, DoubleVT, A64, B64, Flags);
+        Fma64 = DAG.getNode(ISD::FADD, sdl, DoubleVT, Mul, C64, Flags);
+      }
+
+      SDValue ResHalf =
+          DAG.getNode(ISD::FP_ROUND, sdl, HalfVT, Fma64,
+                      DAG.getIntPtrConstant(0, sdl, /*isTarget=*/true));
+
+      setValue(&I, ResHalf);
+    } else {
+      setValue(&I,
+               DAG.getNode(ISD::FMA, sdl, A.getValueType(), A, B, C, Flags));
+    }
     return;
+  }
 #define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC)                         \
   case Intrinsic::INTRINSIC:
 #include "llvm/IR/ConstrainedOps.def"
diff --git a/llvm/test/CodeGen/ARM/fp16-promote.ll b/llvm/test/CodeGen/ARM/fp16-promote.ll
index 8230e47259dd8..27a0bf2eb9037 100644
--- a/llvm/test/CodeGen/ARM/fp16-promote.ll
+++ b/llvm/test/CodeGen/ARM/fp16-promote.ll
@@ -1508,61 +1508,81 @@ define void @test_fma(ptr %p, ptr %q, ptr %r) #0 {
 ; CHECK-FP16-NEXT:    push {r4, lr}
 ; CHECK-FP16-NEXT:    mov r4, r0
 ; CHECK-FP16-NEXT:    ldrh r0, [r1]
-; CHECK-FP16-NEXT:    ldrh r1, [r4]
-; CHECK-FP16-NEXT:    ldrh r2, [r2]
-; CHECK-FP16-NEXT:    vmov s2, r0
+; CHECK-FP16-NEXT:    ldrh r1, [r2]
+; CHECK-FP16-NEXT:    vmov s0, r0
+; CHECK-FP16-NEXT:    ldrh r0, [r4]
+; CHECK-FP16-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-FP16-NEXT:    vcvt.f64.f32 d16, s0
+; CHECK-FP16-NEXT:    vmov s0, r0
+; CHECK-FP16-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-FP16-NEXT:    vcvt.f64.f32 d17, s0
 ; CHECK-FP16-NEXT:    vmov s0, r1
-; CHECK-FP16-NEXT:    vcvtb.f32.f16 s1, s2
-; CHECK-FP16-NEXT:    vmov s2, r2
 ; CHECK-FP16-NEXT:    vcvtb.f32.f16 s0, s0
-; CHECK-FP16-NEXT:    vcvtb.f32.f16 s2, s2
-; CHECK-FP16-NEXT:    bl fmaf
-; CHECK-FP16-NEXT:    vcvtb.f16.f32 s0, s0
-; CHECK-FP16-NEXT:    vmov r0, s0
+; CHECK-FP16-NEXT:    vcvt.f64.f32 d18, s0
+; CHECK-FP16-NEXT:    vmla.f64 d18, d17, d16
+; CHECK-FP16-NEXT:    vmov r0, r1, d18
+; CHECK-FP16-NEXT:    bl __aeabi_d2h
 ; CHECK-FP16-NEXT:    strh r0, [r4]
 ; CHECK-FP16-NEXT:    pop {r4, pc}
 ;
 ; CHECK-LIBCALL-VFP-LABEL: test_fma:
 ; CHECK-LIBCALL-VFP:         .save {r4, r5, r6, lr}
 ; CHECK-LIBCALL-VFP-NEXT:    push {r4, r5, r6, lr}
+; CHECK-LIBCALL-VFP-NEXT:    .vsave {d8, d9}
+; CHECK-LIBCALL-VFP-NEXT:    vpush {d8, d9}
 ; CHECK-LIBCALL-VFP-NEXT:    mov r4, r0
-; CHECK-LIBCALL-VFP-NEXT:    ldrh r0, [r2]
-; CHECK-LIBCALL-VFP-NEXT:    mov r5, r1
+; CHECK-LIBCALL-VFP-NEXT:    ldrh r0, [r0]
+; CHECK-LIBCALL-VFP-NEXT:    mov r5, r2
+; CHECK-LIBCALL-VFP-NEXT:    mov r6, r1
 ; CHECK-LIBCALL-VFP-NEXT:    bl __aeabi_h2f
-; CHECK-LIBCALL-VFP-NEXT:    mov r6, r0
-; CHECK-LIBCALL-VFP-NEXT:    ldrh r0, [r5]
+; CHECK-LIBCALL-VFP-NEXT:    ldrh r1, [r6]
+; CHECK-LIBCALL-VFP-NEXT:    vmov s16, r0
+; CHECK-LIBCALL-VFP-NEXT:    ldrh r5, [r5]
+; CHECK-LIBCALL-VFP-NEXT:    mov r0, r1
 ; CHECK-LIBCALL-VFP-NEXT:    bl __aeabi_h2f
-; CHECK-LIBCALL-VFP-NEXT:    mov r5, r0
-; CHECK-LIBCALL-VFP-NEXT:    ldrh r0, [r4]
+; CHECK-LIBCALL-VFP-NEXT:    vmov s18, r0
+; CHECK-LIBCALL-VFP-NEXT:    mov r0, r5
 ; CHECK-LIBCALL-VFP-NEXT:    bl __aeabi_h2f
 ; CHECK-LIBCALL-VFP-NEXT:    vmov s0, r0
-; CHECK-LIBCALL-VFP-NEXT:    vmov s1, r5
-; CHECK-LIBCALL-VFP-NEXT:    vmov s2, r6
-; CHECK-LIBCALL-VFP-NEXT:    bl fmaf
-; CHECK-LIBCALL-VFP-NEXT:    vmov r0, s0
-; CHECK-LIBCALL-VFP-NEXT:    bl __aeabi_f2h
+; CHECK-LIBCALL-VFP-NEXT:    vcvt.f64.f32 d16, s18
+; CHECK-LIBCALL-VFP-NEXT:    vcvt.f64.f32 d17, s16
+; CHECK-LIBCALL-VFP-NEXT:    vcvt.f64.f32 d18, s0
+; CHECK-LIBCALL-VFP-NEXT:    vmla.f64 d18, d17, d16
+; CHECK-LIBCALL-VFP-NEXT:    vmov r0, r1, d18
+; CHECK-LIBCALL-VFP-NEXT:    bl __aeabi_d2h
 ; CHECK-LIBCALL-VFP-NEXT:    strh r0, [r4]
+; CHECK-LIBCALL-VFP-NEXT:    vpop {d8, d9}
 ; CHECK-LIBCALL-VFP-NEXT:    pop {r4, r5, r6, pc}
 ;
 ; CHECK-NOVFP-LABEL: test_fma:
-; CHECK-NOVFP:         .save {r4, r5, r6, lr}
-; CHECK-NOVFP-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NOVFP:         .save {r4, r5, r6, r7, r11, lr}
+; CHECK-NOVFP-NEXT:    push {r4, r5, r6, r7, r11, lr}
 ; CHECK-NOVFP-NEXT:    mov r4, r0
 ; CHECK-NOVFP-NEXT:    ldrh r0, [r1]
 ; CHECK-NOVFP-NEXT:    mov r5, r2
 ; CHECK-NOVFP-NEXT:    bl __aeabi_h2f
+; CHECK-NOVFP-NEXT:    bl __aeabi_f2d
 ; CHECK-NOVFP-NEXT:    mov r6, r0
-; CHECK-NOVFP-NEXT:    ldrh r0, [r5]
-; CHECK-NOVFP-NEXT:    bl __aeabi_h2f
-; CHECK-NOVFP-NEXT:    mov r5, r0
 ; CHECK-NOVFP-NEXT:    ldrh r0, [r4]
+; CHECK-NOVFP-NEXT:    mov r7, r1
 ; CHECK-NOVFP-NEXT:    bl __aeabi_h2f
-; CHECK-NOVFP-NEXT:    mov r1, r6
-; CHECK-NOVFP-NEXT:    mov r2, r5
-; CHECK-NOVFP-NEXT:    bl fmaf
-; CHECK-NOVFP-NEXT:    bl __aeabi_f2h
+; CHECK-NOVFP-NEXT:    bl __aeabi_f2d
+; CHECK-NOVFP-NEXT:    mov r2, r6
+; CHECK-NOVFP-NEXT:    mov r3, r7
+; CHECK-NOVFP-NEXT:    bl __aeabi_dmul
+; CHECK-NOVFP-NEXT:    mov r6, r0
+; CHECK-NOVFP-NEXT:    ldrh r0, [r5]
+; CHECK-NOVFP-NEXT:    mov r7, r1
+; CHECK-NOVFP-NEXT:    bl __aeabi_h2f
+; CHECK-NOVFP-NEXT:    bl __aeabi_f2d
+; CHECK-NOVFP-NEXT:    mov r2, r0
+; CHECK-NOVFP-NEXT:    mov r3, r1
+; CHECK-NOVFP-NEXT:    mov r0, r6
+; CHECK-NOVFP-NEXT:    mov r1, r7
+; CHECK-NOVFP-NEXT:    bl __aeabi_dadd
+; CHECK-NOVFP-NEXT:    bl __aeabi_d2h
 ; CHECK-NOVFP-NEXT:    strh r0, [r4]
-; CHECK-NOVFP-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NOVFP-NEXT:    pop {r4, r5, r6, r7, r11, pc}
   %a = load half, ptr %p, align 2
   %b = load half, ptr %q, align 2
   %c = load half, ptr %r, align 2
diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-06.ll b/llvm/test/CodeGen/SystemZ/fp-mul-06.ll
index 6b285a49057dc..f46fb92c59ba5 100644
--- a/llvm/test/CodeGen/SystemZ/fp-mul-06.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-mul-06.ll
@@ -5,15 +5,16 @@
 
 declare half @llvm.fma.f16(half %f1, half %f2, half %f3)
 declare float @llvm.fma.f32(float %f1, float %f2, float %f3)
+declare double @llvm.fma.f64(double %f1, double %f2, double %f3)
 
 define half @f0(half %f1, half %f2, half %acc) {
 ; CHECK-LABEL: f0:
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK-SCALAR: maebr %f0, %f9, %f10
-; CHECK-VECTOR: wfmasb %f0, %f0, %f8, %f10
-; CHECK: brasl %r14, __truncsfhf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK-SCALAR: madbr %f0, %f9, %f10
+; CHECK-VECTOR: wfmadb %f0, %f0, %f8, %f10
+; CHECK: brasl %r14, __truncdfhf2@PLT
 ; CHECK: br %r14
   %res = call half @llvm.fma.f16 (half %f1, half %f2, half %acc)
   ret half %res
@@ -29,6 +30,16 @@ define float @f1(float %f1, float %f2, float %acc) {
   ret float %res
 }
 
+define double @f9(double %f1, double %f2, double %acc) {
+; CHECK-LABEL: f9:
+; CHECK-SCALAR: madbr %f4, %f0, %f2
+; CHECK-SCALAR: ldr %f0, %f4
+; CHECK-VECTOR: wfmadb %f0, %f0, %f2, %f4
+; CHECK: br %r14
+  %res = call double @llvm.fma.f64 (double %f1, double %f2, double %acc)
+  ret double %res
+}
+
 define float @f2(float %f1, ptr %ptr, float %acc) {
 ; CHECK-LABEL: f2:
 ; CHECK: maeb %f2, %f0, 0(%r2)
diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-08.ll b/llvm/test/CodeGen/SystemZ/fp-mul-08.ll
index e739bddd4f18f..542cae41d4745 100644
--- a/llvm/test/CodeGen/SystemZ/fp-mul-08.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-mul-08.ll
@@ -10,12 +10,12 @@ define half @f0(half %f1, half %f2, half %acc) {
 ; CHECK-LABEL: f0:
 ; CHECK-NOT: brasl
 ; CHECK: lcdfr %f{{[0-9]+}}, %f4
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK-SCALAR: maebr %f0, %f8, %f10
-; CHECK-VECTOR: wfmasb %f0, %f0, %f8, %f10
-; CHECK: brasl %r14, __truncsfhf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK-SCALAR: madbr %f0, %f8, %f10
+; CHECK-VECTOR: wfmadb %f0, %f0, %f8, %f10
+; CHECK: brasl %r14, __truncdfhf2@PLT
 ; CHECK: br %r14
   %negacc = fneg half %acc
   %res = call half @llvm.fma.f16 (half %f1, half %f2, half %negacc)
diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-10.ll b/llvm/test/CodeGen/SystemZ/fp-mul-10.ll
index 8f2cd23112cd0..0badf2993cca7 100644
--- a/llvm/test/CodeGen/SystemZ/fp-mul-10.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-mul-10.ll
@@ -25,11 +25,11 @@ define double @f2(double %f1, double %f2, double %acc) {
 
 define half @f3_half(half %f1, half %f2, half %acc) {
 ; CHECK-LABEL: f3_half:
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: wfmasb %f0, %f0, %f8, %f10
-; CHECK: brasl %r14, __truncsfhf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: wfmadb %f0, %f0, %f8, %f10
+; CHECK: brasl %r14, __truncdfhf2@PLT
 ; CHECK-NOT: brasl
 ; CHECK:      lcdfr %f0, %f0
 ; CHECK-NEXT: lmg
@@ -52,11 +52,11 @@ define half @f4_half(half %f1, half %f2, half %acc) {
 ; CHECK-LABEL: f4_half:
 ; CHECK-NOT: brasl
 ; CHECK: lcdfr %f0, %f4
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: wfmasb %f0, %f0, %f8, %f10
-; CHECK: brasl %r14, __truncsfhf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: wfmadb %f0, %f0, %f8, %f10
+; CHECK: brasl %r14, __truncdfhf2@PLT
 ; CHECK-NOT: brasl
 ; CHECK:      lcdfr %f0, %f0
 ; CHECK-NEXT: lmg

@llvmbot
Member

llvmbot commented Dec 11, 2025

@llvm/pr-subscribers-llvm-selectiondag


@llvmbot
Member

llvmbot commented Dec 11, 2025

@llvm/pr-subscribers-backend-arm


@tgross35
Contributor

I could not find examples of a libcall being used for fma, but that's something that could be looked into separately to work around code size issues.

I think this is pretty triple-dependent, e.g. glibc and LLVM’s libraries have them but not much else does. What would be nice one day is for a frontend to have a way to indicate that a full C23 math library is available (with fmaf16, fmaf32, fmaf64, fmaf128 and similar, rather than just fma/fmaf/fmal), which would help with this plus a few other things (I’ve mentioned something similar to @arsenm before).

But for now, unconditionally using f64 ops is definitely easiest.

@github-actions

github-actions bot commented Dec 11, 2025

🪟 Windows x64 Test Results

  • 128705 tests passed
  • 2824 tests skipped

✅ The build succeeded and all tests passed.

@github-actions

github-actions bot commented Dec 11, 2025

🐧 Linux x64 Test Results

  • 187555 tests passed
  • 4968 tests skipped

✅ The build succeeded and all tests passed.

Contributor

@arsenm arsenm left a comment

This cannot go in SelectionDAGBuilder. This needs to be handled in LegalizeFloatTypes. In particular this will not catch FMAs later introduced during combine / legalize. It also doesn't remove the broken path, so those will still hit that

@arsenm arsenm added the floating-point Floating-point math label Dec 11, 2025
@folkertdev
Contributor Author

This cannot go in SelectionDAGBuilder. This needs to be handled in LegalizeFloatTypes. In particular this will not catch FMAs later introduced during combine / legalize. It also doesn't remove the broken path, so those will still hit that

Hmm, I see the problem, but conceptually, how would that work? I don't see any such custom logic for f16 in LegalizeFloatTypes.cpp currently. E.g. here:

SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) {
  EVT OVT = N->getValueType(0);
  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
  SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
  SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
  SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
  SDLoc dl(N);

  // Promote to the larger FP type.
  auto PromotionOpcode = GetPromotionOpcode(OVT, NVT);
  Op0 = DAG.getNode(PromotionOpcode, dl, NVT, Op0);
  Op1 = DAG.getNode(PromotionOpcode, dl, NVT, Op1);
  Op2 = DAG.getNode(PromotionOpcode, dl, NVT, Op2);

  SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1, Op2);

  // Convert back to FP16 as an integer.
  return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res);
}

the type to promote to is apparently already known, and just picking something else probably won't go well?

@arsenm
Contributor

arsenm commented Dec 11, 2025

The easiest thing to do would be to take the default float promoted values, insert your own fp_extend from f32 to f64, and then insert your own truncate to the legal f32 for the result
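
A sketch of that suggestion applied to the SoftPromoteHalfRes_FMAD body quoted above (hypothetical and untested, not the patch as merged; it assumes FP_TO_FP16 of an f64 operand legalizes to a single-rounding truncation such as __truncdfhf2):

SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) {
  EVT OVT = N->getValueType(0);   // f16, carried as i16
  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT); // usually f32
  SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
  SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
  SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
  SDLoc dl(N);

  // Default promotion to the larger FP type, as before.
  auto PromotionOpcode = GetPromotionOpcode(OVT, NVT);
  Op0 = DAG.getNode(PromotionOpcode, dl, NVT, Op0);
  Op1 = DAG.getNode(PromotionOpcode, dl, NVT, Op1);
  Op2 = DAG.getNode(PromotionOpcode, dl, NVT, Op2);

  // Extra step: widen to f64 so the multiply-add rounds only once.
  Op0 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op0);
  Op1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op1);
  Op2 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op2);

  SDValue Res = DAG.getNode(N->getOpcode(), dl, MVT::f64, Op0, Op1, Op2);

  // Truncate f64 straight back to FP16-as-integer in one rounding step.
  return DAG.getNode(GetPromotionOpcode(MVT::f64, OVT), dl, MVT::i16, Res);
}

On targets without native f64 arithmetic, op legalization would then still have to soften or expand the f64 FMA, which is where the libcall discussion below comes in.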

@arsenm
Contributor

arsenm commented Dec 12, 2025

I think what would be nice one day is for a frontend to have a way to indicate that a full C23 math library is available

This should be implied by the triple. Nothing is stopping you from just adding these functions to RuntimeLibcallsInfo today

@folkertdev
Contributor Author

I made the changes to SoftPromoteHalfRes_FMAD, and they help in some cases, but not in all of them. I'm missing something, and I'm not sure what that would be.

ExpandFloatRes_FMA and SoftenFloatRes_FMA already emit libcalls, so they don't seem relevant. Changing PromoteFloatRes_FMAD does not seem to have an effect.

@uweigand
Member

I made the changes to SoftPromoteHalfRes_FMAD, and they help in some cases, but not in all of them. I'm missing something, and I'm not sure what that would be.

For SystemZ, we're not using SoftPromoteHalf, but a regular Promote operation.

ExpandFloatRes_FMA and SoftenFloatRes_FMA already emit libcalls, so they don't seem relevant. Changing PromoteFloatRes_FMAD does not seem to have an effect.

That's where I thought we should end up ...

Contributor

What about bf16? Can you express this in terms of the fundamental properties of the fltSemantics?

Contributor Author

I'm not sure, based on #131531 the bf16 variant should be a libcall unconditionally.

Contributor

Libcall is a mechanical property, it doesn't change the type or rounding issues. The mixing of the libcall emission into type legalization for the soft float case is an unfortunate DAG-ism

Contributor Author

huh, doesn't picking the bf16 fma libcall do the right thing? or are you saying that currently it would pick the f32 libcall (with associated promotion/rounding)?

In any case even promoting to f64 will not give accurate results for bf16, so I'm not sure what to do?

Contributor

There is currently no listing or knowledge of bf16 libcalls in llvm. I wouldn't be surprised if the promotion of bf16 on targets without legal float, or that need soft float, is untested

Contributor Author

Well then I'm not sure what your suggestion is here? I don't see how we can implement anything valid for bf16?

Contributor

In any case even promoting to f64 will not give accurate results for bf16

That surprises me, really? If we don't already have tests relying on it, I'd probably return poison and emitError

@arsenm
Contributor

arsenm commented Dec 12, 2025

ExpandFloatRes_FMA and SoftenFloatRes_FMA already emit libcalls, so they don't seem relevant.

Those don't know about f16 libcalls, so those probably error if reached.

@folkertdev
Contributor Author

Those don't know about f16 libcalls, so those probably error if reached.

Is that a problem? should they know about the f16 libcalls? adding an extra argument to GetFPLibCall is going to be kind of invasive.

@arsenm
Contributor

arsenm commented Dec 12, 2025

Those don't know about f16 libcalls, so those probably error if reached.

Is that a problem? should they know about the f16 libcalls? adding an extra argument to GetFPLibCall is going to be kind of invasive.

Yes. I want tablegen to start generating all the get* libcall helper functions to assist this

@folkertdev
Contributor Author

Those don't know about f16 libcalls, so those probably error if reached.

Is that a problem? should they know about the f16 libcalls? adding an extra argument to GetFPLibCall is going to be kind of invasive.

Yes. I want tablegen to start generating all the get* libcall helper functions to assist this

Sure, anything I should do here now though?

@folkertdev
Contributor Author

the test failures are kind of annoying

the Generic tests sort of require all backends, and I don't really want to build everything. The AMDGPU one succeeds locally, but also its name suggests it deals with bf16, so maybe it actually should not change at all?

@tgross35
Contributor

the test failures are kind of annoying

the Generic tests sort of require all backends, and I don't really want to build everything. The AMDGPU one succeeds locally, but also its name suggests it deals with bf16, so maybe it actually should not change at all?

The failing check in half-op.ll is an xfail for the issue that this fixes, so I think you should just be able to replace BAD-FMA with CHECK-FMA in all the RUN instructions and almost all targets should pass.

That will still fail if there are targets that do f64->f32->f16 rather than f64->f16. Maybe replace BAD-FMA with SKIP-FMA in these cases?

@arsenm
Contributor

arsenm commented Dec 13, 2025

the test failures are kind of annoying

the Generic tests sort of require all backends, and I don't really want to build everything. The AMDGPU one succeeds locally, but also its name suggests it deals with bf16, so maybe it actually should not change at all?

Don't add tests to Generic. There's not really any such thing as a "generic" codegen test; it's just a source of build pain. There's always a target dependency.

In this case the prior art is the sequence of %ifd run lines, so I guess keep going with that

@folkertdev
Contributor Author

I don't want to add to those tests, but they fail because on some platforms the output is now different. So the output needs to be updated, but the standard llvm-lit won't even run this test.

@uweigand
Member

I'm not sure it makes sense to change the PromoteFloatRes path. That legalization strategy is known to produce incorrectly rounded results for half promotion, which is why backends are being migrated to SoftPromoteHalf. So I'd expect any backends that are still broken to be fixed once that happens.

I was not aware there was a requirement for all targets to use SoftPromoteHalf? We've chosen to not use this on SystemZ because it uses i16 everywhere, which would appear to make implementing the proper f16 ABI (to be compatible with any potential future hardware implementation) more difficult.

What specifically are the problems SoftPromoteHalf is supposed to address? I'm aware of these two:
#97981
#97975
and both of them are working correctly with our current SystemZ implementation (without SoftPromoteHalf).

@nikic
Contributor

nikic commented Dec 15, 2025

I'm not sure it makes sense to change the PromoteFloatRes path. That legalization strategy is known to produce incorrectly rounded results for half promotion, which is why backends are being migrated to SoftPromoteHalf. So I'd expect any backends that are still broken to be fixed once that happens.

I was not aware there was a requirement for all targets to use SoftPromoteHalf? We've chosen to not use this on SystemZ because it uses i16 everywhere, which would appear to make implementing the proper f16 ABI (to be compatible with any potential future hardware implementation) more difficult.

What specifically are the problems SoftPromoteHalf is supposed to address? I'm aware of these two: #97981 #97975 and both of them are working correctly with our current SystemZ implementation (without SoftPromoteHalf).

SystemZ makes f16 a legal type, so the PromoteFloat vs SoftPromoteHalf distinction is not relevant in that case: These are type legalization strategies, not op legalization strategies.

SystemZ still does need to use SoftPromoteHalf in the case where +soft-float is used and f16 is not a legal type. You can see the standard example being miscompiled here: https://llvm.godbolt.org/z/74rE5afYn

To add some additional confusion here, there is the separate "Promote" op legalization action (handled by PromoteNode), distinct from the type legalization actions, and as far as I can tell it does handle things correctly in general.

The FMA case here is handled by

case ISD::FMA:
  Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
  Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1));
  Tmp3 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(2));
  Results.push_back(
      DAG.getNode(ISD::FP_ROUND, dl, OVT,
                  DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2, Tmp3),
                  DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)));
  break;

This is the code that the AddPromotedToType() calls control. (I kind of wonder whether we should be setting the promoted type for that operation by default...)

@uweigand
Member

SystemZ makes f16 a legal type, so the PromoteFloat vs SoftPromoteHalf distinction is not relevant in that case: These are type legalization strategies, not op legalization strategies.

I see, thanks!

SystemZ still does need to use SoftPromoteHalf in the case where +soft-float is used and f16 is not a legal type. You can see the standard example being miscompiled here: https://llvm.godbolt.org/z/74rE5afYn

Right. We didn't notice that because on SystemZ the only user of +soft-float is the Linux kernel, and they don't actually use any floating-point types, much less f16. But I agree this should be fixed, we'll have a look.

This is the code that the AddPromotedToType() calls control. (I kind of wonder whether we should be setting the promoted type for that operation by default...)

I think that's what @folkertdev mentions above, that we can fix fma on SystemZ via AddPromotedToType. I don't have a strong opinion on whether this should be a platform-specific choice or the default - but it would appear that the current default is simply incorrect (cannot work correctly), so it might make sense to change it.
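
For reference, that knob is set in a backend's TargetLowering constructor; a hypothetical SystemZ-style snippet (not the actual patch):

// Promote f16 FMA and name f64 (rather than the default next-larger
// legal type, typically f32) as the promoted type, so the PromoteNode
// path quoted above extends the operands to f64 and rounds back to f16
// only once.
setOperationAction(ISD::FMA, MVT::f16, Promote);
AddPromotedToType(ISD::FMA, MVT::f16, MVT::f64);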

Comment on lines 845 to 848
Contributor Author

would this fail if the target does not have a legal f64? I think that is the riscv error that CI runs into. What is the right thing to do in that case?

Contributor

Yes, it looks like for RISCV if we have zfhmin without d we have the wonderful combination of legal f16, illegal f64 and no native f16 FMA. I'm not sure either what to do in that case. cc @topperc

Probably we need to emit a fmaf16 libcall in that case, but I don't think the infrastructure for that exists yet. Maybe for now just move this back into the SystemZ backend and open an issue to resolve the remaining RISCV issue separately?

Contributor Author

This change is not just for SystemZ, but also AArch64, NVPTX and x86_64. So this bit of config has to be replicated quite a number of times.

Contributor

I see. I guess in that case you could explicitly set the promoted type to f32 in the RISCV backend instead if +d is not available...

@folkertdev folkertdev force-pushed the f16-fma branch 2 times, most recently from 438f8ba to 133aba5 on December 16, 2025 at 10:32
@folkertdev
Contributor Author

I think the windows test failure is spurious

#171904 (comment)

if so, I think this is ready.

@folkertdev
Contributor Author

Should anyone else review this? (it's unclear to me where the initiative lies)

@arsenm arsenm merged commit a587ccd into llvm:main Dec 17, 2025
10 checks passed
@bjacob
Contributor

bjacob commented Dec 18, 2025

Heads up that this is causing linking errors in a downstream (IREE), in CPU code generation with no native support for f16:
https://gist.github.com/bjacob/f2da084dd681911d32059df82b44c8e5

@bjacob
Contributor

bjacob commented Dec 18, 2025

OK, I understand the failure now but also have thoughts about this PR.

Our immediate issue is that we carry our own bitcode library for builtin functions like f16 conversions, and we are missing __truncdfhf2. That's probably something that we should have, so in that sense the failure is a useful signal.

However, going back to what this PR is doing, as a downstream, I wonder if we really should be promoting f16 to f64. I get that it's the technically correct thing to do, for the reason that you say. But it's also going to be surprising - I wasn't expecting f64 to be involved in f16 arithmetic until I read the rationale in this PR and the linked issue. And some users are going to prefer the slightly incorrect f32 arithmetic as the better trade-off for their application domain --- not every user cares about the last bit of mantissa enough to bother with f64.

I can see that the code checks the subtarget for f64 support, which is good, but my experience shows that this doesn't catch all cases, as besides CPU target support there is also the question of existence of library symbols in downstreams. Moreover, the mere existence of f64 support doesn't imply for all users that it's the trade-off of performance vs correctness that they want.

I feel that there should at least be an opt-out here.

@arsenm
Contributor

arsenm commented Dec 18, 2025

Our immediate issue is that we carry our own bitcode library for builtin functions like f16 conversions, and we are missing __truncdfhf2. That's probably something that we should have, so in that sense the failure is a useful signal.

The compiler can't respect a random DIY ABI. If you're pretending to be compatible with the triple, you have to provide the full ABI.

I feel that there should at least be an opt-out here.

Wrong is wrong. The legalization has the option of using the afn flag to keep promoting to f32, but that cannot be mandatory

@bjacob
Contributor

bjacob commented Dec 18, 2025

Point taken about the ABI argument. I think, though, that my other point about the tradeoff of correctness vs the various downsides of involving f64 in f16 arithmetic still stands. I don't know a simple solution. The subtarget would need to have finer controls than the current boolean yes/no f64 support.

@arsenm
Contributor

arsenm commented Dec 19, 2025

Point taken about the ABI argument.

It's a bit of a moot point in this case. We don't have a compiler implementation of FMA to fall back on, so the alternative is error

I think though that my other point about the tradeoff of correctness vs the various downsides of involving f64 in f16 arithmetic, still stand. I don't know a simple solution though. The subtarget would need to have finer controls than the current boolean yes/no f64 support.

There shouldn't be a mandatory control to degrade correctness. If you want to force the behavior, you can do your own cast to float. If you just need the optimization hint, this could use the fast math flag.

How off is this result? Is there another fixup sequence that could be applied in float?

@bjacob
Contributor

bjacob commented Dec 19, 2025

Good idea to deal with this on our side by promoting f16 to f32 on targets where f16 isn't natively supported. We should just do that. Thanks.

Also a good idea in principle to condition that on some kind of fast-math flag, but as you know LLVM fast-math flags are their own can of worms that I'm trying not to reopen just before the holidays :-)

mahesh-attarde pushed a commit to mahesh-attarde/llvm-project that referenced this pull request Dec 19, 2025
…ort (llvm#171904)

fixes llvm#98389

As the issue describes, promoting `llvm.fma.f16` to `llvm.fma.f32` does
not work, because there is not enough precision to handle the repeated
rounding. `f64` does have sufficient space. So this PR explicitly
promotes the 16-bit fma to a 64-bit fma.

I could not find examples of a libcall being used for fma, but that's
something that could be looked into separately to work around code size
issues.
@nikic
Contributor

nikic commented Dec 19, 2025

Good idea to deal with this on our side by promoting f16 to f32 on targets where f16 isn't natively supported. We should just do that. Thanks.

FWIW, Clang also does this kind of promotion, controllable by -fexcess-precision. It avoids the intermediate rounding steps between each operation if you do not care about precisely rounded results.
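
As an illustration of what that flag changes (a hypothetical example; _Float16 availability and the exact evaluation strategy depend on the target and Clang version):

// With excess precision enabled (Clang's default where f16 is not
// native), the whole expression may be evaluated in float and rounded
// to _Float16 once at the end; with it disabled, each operation is
// rounded to _Float16 individually, costing extra conversions but
// giving strict per-operation semantics.
_Float16 mul_add(_Float16 a, _Float16 b, _Float16 c) {
  return a * b + c;
}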

bjacob added a commit to iree-org/iree that referenced this pull request Dec 19, 2025
Integrate
llvm/llvm-project@1c023cb

This folds Lukas' commits from #22921. Thanks!

Additional local change: adding `f16<->f64` conversions and `f64` `fma`
to `libdevice` as these are now required by lowerings of `f16`
multiply-add to targets without native `f16` support (typically CPU
targets).
* The symptom that this is fixing is unresolved symbol errors at
linking.
* The regressing upstream PR is
llvm/llvm-project#171904.
* Rationale given there: it's necessary to go to `f64` to implement a
perfectly accurate `f16` multiply-add without observable
double-rounding.
* We don't need to worry too much about the performance here: by
definition, this code only happens when we use a data type that is not
natively supported in hardware, so users who care about performance
should not be on such a path anyway. This preexisting rationale is why
the `libdevice` implementation of `f16` conversions favors simplicity
over performance.

Existing local reverts carried forward:
* Local revert of llvm/llvm-project#169614 due to
#22649.
* Also
iree-org/llvm-project@dea9ec8
which had landed on top.

---------

Signed-off-by: Benoit Jacob <[email protected]>
Signed-off-by: Lukas Sommer <[email protected]>
Co-authored-by: Lukas Sommer <[email protected]>
Successfully merging this pull request may close these issues.

llvm.fma.f16 intrinsic is expanded incorrectly on targets without native half FMA support
