fix llvm.fma.f16 double rounding issue when there is no native support
#171904

Conversation
@llvm/pr-subscribers-backend-nvptx @llvm/pr-subscribers-backend-systemz @llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-arm

Author: Folkert de Vries (folkertdev)

Changes

fixes #98389

As the issue describes, promoting `llvm.fma.f16` to `llvm.fma.f32` does not work, because there is not enough precision to handle the repeated rounding. `f64` does have sufficient space. So this PR explicitly promotes the 16-bit fma to a 64-bit fma.

I could not find examples of a libcall being used for fma, but that's something that could be looked into separately to work around code size issues.

Full diff: https://github.com/llvm/llvm-project/pull/171904.diff (5 files affected)
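Conceptually, the new lowering corresponds to the following IR (an illustrative sketch with hypothetical names; the actual change, in the SelectionDAG lowering, is in the diff below):

```llvm
declare double @llvm.fma.f64(double, double, double)

; New lowering when f16 FMA is not legal: extend to f64, do one FMA there,
; and round once to f16. The old path extended only to f32, so the result
; was rounded twice (once to f32, once to f16), which is observable.
define half @fma_f16(half %a, half %b, half %c) {
  %a64 = fpext half %a to double
  %b64 = fpext half %b to double
  %c64 = fpext half %c to double
  %r64 = call double @llvm.fma.f64(double %a64, double %b64, double %c64)
  %r = fptrunc double %r64 to half
  ret half %r
}
```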
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index dcf2df305d24a..4c74cc9ebe061 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6919,12 +6919,43 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
getValue(I.getArgOperand(0)), Flags));
return;
}
- case Intrinsic::fma:
- setValue(&I, DAG.getNode(
- ISD::FMA, sdl, getValue(I.getArgOperand(0)).getValueType(),
- getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)),
- getValue(I.getArgOperand(2)), Flags));
+ case Intrinsic::fma: {
+ SDValue A = getValue(I.getArgOperand(0));
+ SDValue B = getValue(I.getArgOperand(1));
+ SDValue C = getValue(I.getArgOperand(2));
+
+ Type *Ty = I.getType();
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), Ty);
+ if (Ty->isHalfTy() && !TLI.isOperationLegalOrCustom(ISD::FMA, VT)) {
+ // An f16 fma must go via f64 to prevent double rounding issues.
+
+ EVT HalfVT = VT;
+ EVT DoubleVT = MVT::f64;
+
+ SDValue A64 = DAG.getNode(ISD::FP_EXTEND, sdl, DoubleVT, A);
+ SDValue B64 = DAG.getNode(ISD::FP_EXTEND, sdl, DoubleVT, B);
+ SDValue C64 = DAG.getNode(ISD::FP_EXTEND, sdl, DoubleVT, C);
+
+ // Prefer FMA in double if the target has it (optimizes better).
+ SDValue Fma64;
+ if (TLI.isOperationLegalOrCustom(ISD::FMA, DoubleVT)) {
+ Fma64 = DAG.getNode(ISD::FMA, sdl, DoubleVT, A64, B64, C64, Flags);
+ } else {
+ SDValue Mul = DAG.getNode(ISD::FMUL, sdl, DoubleVT, A64, B64, Flags);
+ Fma64 = DAG.getNode(ISD::FADD, sdl, DoubleVT, Mul, C64, Flags);
+ }
+
+ SDValue ResHalf =
+ DAG.getNode(ISD::FP_ROUND, sdl, HalfVT, Fma64,
+ DAG.getIntPtrConstant(0, sdl, /*isTarget=*/true));
+
+ setValue(&I, ResHalf);
+ } else {
+ setValue(&I,
+ DAG.getNode(ISD::FMA, sdl, A.getValueType(), A, B, C, Flags));
+ }
return;
+ }
#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \
case Intrinsic::INTRINSIC:
#include "llvm/IR/ConstrainedOps.def"
diff --git a/llvm/test/CodeGen/ARM/fp16-promote.ll b/llvm/test/CodeGen/ARM/fp16-promote.ll
index 8230e47259dd8..27a0bf2eb9037 100644
--- a/llvm/test/CodeGen/ARM/fp16-promote.ll
+++ b/llvm/test/CodeGen/ARM/fp16-promote.ll
@@ -1508,61 +1508,81 @@ define void @test_fma(ptr %p, ptr %q, ptr %r) #0 {
; CHECK-FP16-NEXT: push {r4, lr}
; CHECK-FP16-NEXT: mov r4, r0
; CHECK-FP16-NEXT: ldrh r0, [r1]
-; CHECK-FP16-NEXT: ldrh r1, [r4]
-; CHECK-FP16-NEXT: ldrh r2, [r2]
-; CHECK-FP16-NEXT: vmov s2, r0
+; CHECK-FP16-NEXT: ldrh r1, [r2]
+; CHECK-FP16-NEXT: vmov s0, r0
+; CHECK-FP16-NEXT: ldrh r0, [r4]
+; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0
+; CHECK-FP16-NEXT: vcvt.f64.f32 d16, s0
+; CHECK-FP16-NEXT: vmov s0, r0
+; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0
+; CHECK-FP16-NEXT: vcvt.f64.f32 d17, s0
; CHECK-FP16-NEXT: vmov s0, r1
-; CHECK-FP16-NEXT: vcvtb.f32.f16 s1, s2
-; CHECK-FP16-NEXT: vmov s2, r2
; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0
-; CHECK-FP16-NEXT: vcvtb.f32.f16 s2, s2
-; CHECK-FP16-NEXT: bl fmaf
-; CHECK-FP16-NEXT: vcvtb.f16.f32 s0, s0
-; CHECK-FP16-NEXT: vmov r0, s0
+; CHECK-FP16-NEXT: vcvt.f64.f32 d18, s0
+; CHECK-FP16-NEXT: vmla.f64 d18, d17, d16
+; CHECK-FP16-NEXT: vmov r0, r1, d18
+; CHECK-FP16-NEXT: bl __aeabi_d2h
; CHECK-FP16-NEXT: strh r0, [r4]
; CHECK-FP16-NEXT: pop {r4, pc}
;
; CHECK-LIBCALL-VFP-LABEL: test_fma:
; CHECK-LIBCALL-VFP: .save {r4, r5, r6, lr}
; CHECK-LIBCALL-VFP-NEXT: push {r4, r5, r6, lr}
+; CHECK-LIBCALL-VFP-NEXT: .vsave {d8, d9}
+; CHECK-LIBCALL-VFP-NEXT: vpush {d8, d9}
; CHECK-LIBCALL-VFP-NEXT: mov r4, r0
-; CHECK-LIBCALL-VFP-NEXT: ldrh r0, [r2]
-; CHECK-LIBCALL-VFP-NEXT: mov r5, r1
+; CHECK-LIBCALL-VFP-NEXT: ldrh r0, [r0]
+; CHECK-LIBCALL-VFP-NEXT: mov r5, r2
+; CHECK-LIBCALL-VFP-NEXT: mov r6, r1
; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_h2f
-; CHECK-LIBCALL-VFP-NEXT: mov r6, r0
-; CHECK-LIBCALL-VFP-NEXT: ldrh r0, [r5]
+; CHECK-LIBCALL-VFP-NEXT: ldrh r1, [r6]
+; CHECK-LIBCALL-VFP-NEXT: vmov s16, r0
+; CHECK-LIBCALL-VFP-NEXT: ldrh r5, [r5]
+; CHECK-LIBCALL-VFP-NEXT: mov r0, r1
; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_h2f
-; CHECK-LIBCALL-VFP-NEXT: mov r5, r0
-; CHECK-LIBCALL-VFP-NEXT: ldrh r0, [r4]
+; CHECK-LIBCALL-VFP-NEXT: vmov s18, r0
+; CHECK-LIBCALL-VFP-NEXT: mov r0, r5
; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_h2f
; CHECK-LIBCALL-VFP-NEXT: vmov s0, r0
-; CHECK-LIBCALL-VFP-NEXT: vmov s1, r5
-; CHECK-LIBCALL-VFP-NEXT: vmov s2, r6
-; CHECK-LIBCALL-VFP-NEXT: bl fmaf
-; CHECK-LIBCALL-VFP-NEXT: vmov r0, s0
-; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_f2h
+; CHECK-LIBCALL-VFP-NEXT: vcvt.f64.f32 d16, s18
+; CHECK-LIBCALL-VFP-NEXT: vcvt.f64.f32 d17, s16
+; CHECK-LIBCALL-VFP-NEXT: vcvt.f64.f32 d18, s0
+; CHECK-LIBCALL-VFP-NEXT: vmla.f64 d18, d17, d16
+; CHECK-LIBCALL-VFP-NEXT: vmov r0, r1, d18
+; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_d2h
; CHECK-LIBCALL-VFP-NEXT: strh r0, [r4]
+; CHECK-LIBCALL-VFP-NEXT: vpop {d8, d9}
; CHECK-LIBCALL-VFP-NEXT: pop {r4, r5, r6, pc}
;
; CHECK-NOVFP-LABEL: test_fma:
-; CHECK-NOVFP: .save {r4, r5, r6, lr}
-; CHECK-NOVFP-NEXT: push {r4, r5, r6, lr}
+; CHECK-NOVFP: .save {r4, r5, r6, r7, r11, lr}
+; CHECK-NOVFP-NEXT: push {r4, r5, r6, r7, r11, lr}
; CHECK-NOVFP-NEXT: mov r4, r0
; CHECK-NOVFP-NEXT: ldrh r0, [r1]
; CHECK-NOVFP-NEXT: mov r5, r2
; CHECK-NOVFP-NEXT: bl __aeabi_h2f
+; CHECK-NOVFP-NEXT: bl __aeabi_f2d
; CHECK-NOVFP-NEXT: mov r6, r0
-; CHECK-NOVFP-NEXT: ldrh r0, [r5]
-; CHECK-NOVFP-NEXT: bl __aeabi_h2f
-; CHECK-NOVFP-NEXT: mov r5, r0
; CHECK-NOVFP-NEXT: ldrh r0, [r4]
+; CHECK-NOVFP-NEXT: mov r7, r1
; CHECK-NOVFP-NEXT: bl __aeabi_h2f
-; CHECK-NOVFP-NEXT: mov r1, r6
-; CHECK-NOVFP-NEXT: mov r2, r5
-; CHECK-NOVFP-NEXT: bl fmaf
-; CHECK-NOVFP-NEXT: bl __aeabi_f2h
+; CHECK-NOVFP-NEXT: bl __aeabi_f2d
+; CHECK-NOVFP-NEXT: mov r2, r6
+; CHECK-NOVFP-NEXT: mov r3, r7
+; CHECK-NOVFP-NEXT: bl __aeabi_dmul
+; CHECK-NOVFP-NEXT: mov r6, r0
+; CHECK-NOVFP-NEXT: ldrh r0, [r5]
+; CHECK-NOVFP-NEXT: mov r7, r1
+; CHECK-NOVFP-NEXT: bl __aeabi_h2f
+; CHECK-NOVFP-NEXT: bl __aeabi_f2d
+; CHECK-NOVFP-NEXT: mov r2, r0
+; CHECK-NOVFP-NEXT: mov r3, r1
+; CHECK-NOVFP-NEXT: mov r0, r6
+; CHECK-NOVFP-NEXT: mov r1, r7
+; CHECK-NOVFP-NEXT: bl __aeabi_dadd
+; CHECK-NOVFP-NEXT: bl __aeabi_d2h
; CHECK-NOVFP-NEXT: strh r0, [r4]
-; CHECK-NOVFP-NEXT: pop {r4, r5, r6, pc}
+; CHECK-NOVFP-NEXT: pop {r4, r5, r6, r7, r11, pc}
%a = load half, ptr %p, align 2
%b = load half, ptr %q, align 2
%c = load half, ptr %r, align 2
diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-06.ll b/llvm/test/CodeGen/SystemZ/fp-mul-06.ll
index 6b285a49057dc..f46fb92c59ba5 100644
--- a/llvm/test/CodeGen/SystemZ/fp-mul-06.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-mul-06.ll
@@ -5,15 +5,16 @@
declare half @llvm.fma.f16(half %f1, half %f2, half %f3)
declare float @llvm.fma.f32(float %f1, float %f2, float %f3)
+declare double @llvm.fma.f64(double %f1, double %f2, double %f3)
define half @f0(half %f1, half %f2, half %acc) {
; CHECK-LABEL: f0:
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK-SCALAR: maebr %f0, %f9, %f10
-; CHECK-VECTOR: wfmasb %f0, %f0, %f8, %f10
-; CHECK: brasl %r14, __truncsfhf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK-SCALAR: madbr %f0, %f9, %f10
+; CHECK-VECTOR: wfmadb %f0, %f0, %f8, %f10
+; CHECK: brasl %r14, __truncdfhf2@PLT
; CHECK: br %r14
%res = call half @llvm.fma.f16 (half %f1, half %f2, half %acc)
ret half %res
@@ -29,6 +30,16 @@ define float @f1(float %f1, float %f2, float %acc) {
ret float %res
}
+define double @f9(double %f1, double %f2, double %acc) {
+; CHECK-LABEL: f9:
+; CHECK-SCALAR: madbr %f4, %f0, %f2
+; CHECK-SCALAR: ldr %f0, %f4
+; CHECK-VECTOR: wfmadb %f0, %f0, %f2, %f4
+; CHECK: br %r14
+ %res = call double @llvm.fma.f64 (double %f1, double %f2, double %acc)
+ ret double %res
+}
+
define float @f2(float %f1, ptr %ptr, float %acc) {
; CHECK-LABEL: f2:
; CHECK: maeb %f2, %f0, 0(%r2)
diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-08.ll b/llvm/test/CodeGen/SystemZ/fp-mul-08.ll
index e739bddd4f18f..542cae41d4745 100644
--- a/llvm/test/CodeGen/SystemZ/fp-mul-08.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-mul-08.ll
@@ -10,12 +10,12 @@ define half @f0(half %f1, half %f2, half %acc) {
; CHECK-LABEL: f0:
; CHECK-NOT: brasl
; CHECK: lcdfr %f{{[0-9]+}}, %f4
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK-SCALAR: maebr %f0, %f8, %f10
-; CHECK-VECTOR: wfmasb %f0, %f0, %f8, %f10
-; CHECK: brasl %r14, __truncsfhf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK-SCALAR: madbr %f0, %f8, %f10
+; CHECK-VECTOR: wfmadb %f0, %f0, %f8, %f10
+; CHECK: brasl %r14, __truncdfhf2@PLT
; CHECK: br %r14
%negacc = fneg half %acc
%res = call half @llvm.fma.f16 (half %f1, half %f2, half %negacc)
diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-10.ll b/llvm/test/CodeGen/SystemZ/fp-mul-10.ll
index 8f2cd23112cd0..0badf2993cca7 100644
--- a/llvm/test/CodeGen/SystemZ/fp-mul-10.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-mul-10.ll
@@ -25,11 +25,11 @@ define double @f2(double %f1, double %f2, double %acc) {
define half @f3_half(half %f1, half %f2, half %acc) {
; CHECK-LABEL: f3_half:
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: wfmasb %f0, %f0, %f8, %f10
-; CHECK: brasl %r14, __truncsfhf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: wfmadb %f0, %f0, %f8, %f10
+; CHECK: brasl %r14, __truncdfhf2@PLT
; CHECK-NOT: brasl
; CHECK: lcdfr %f0, %f0
; CHECK-NEXT: lmg
@@ -52,11 +52,11 @@ define half @f4_half(half %f1, half %f2, half %acc) {
; CHECK-LABEL: f4_half:
; CHECK-NOT: brasl
; CHECK: lcdfr %f0, %f4
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: wfmasb %f0, %f0, %f8, %f10
-; CHECK: brasl %r14, __truncsfhf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: wfmadb %f0, %f0, %f8, %f10
+; CHECK: brasl %r14, __truncdfhf2@PLT
; CHECK-NOT: brasl
; CHECK: lcdfr %f0, %f0
; CHECK-NEXT: lmg
I think this is pretty triple-dependent, e.g. glibc and LLVM's libraries have them, but not much else does. What would be nice one day is for a frontend to have a way to indicate that a full C23 math library is available (with fmaf16, fmaf32, fmaf64, fmaf128 and similar available, rather than just fma/fmaf/fmal), which would help with this plus a few other things (I've mentioned something similar to this to @arsenm before). But for now, unconditionally using f64 ops is definitely easiest.
🪟 Windows x64 Test Results: ✅ The build succeeded and all tests passed.

🐧 Linux x64 Test Results: ✅ The build succeeded and all tests passed.
This cannot go in SelectionDAGBuilder. This needs to be handled in LegalizeFloatTypes. In particular, this will not catch FMAs introduced later during combine / legalize. It also doesn't remove the broken path, so those FMAs will still hit it.
Hmm, I see the problem, but conceptually, how would that work? I don't see any such custom logic for fma; e.g. the current handler is:

```cpp
SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) {
  EVT OVT = N->getValueType(0);
  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
  SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
  SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
  SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
  SDLoc dl(N);

  // Promote to the larger FP type.
  auto PromotionOpcode = GetPromotionOpcode(OVT, NVT);
  Op0 = DAG.getNode(PromotionOpcode, dl, NVT, Op0);
  Op1 = DAG.getNode(PromotionOpcode, dl, NVT, Op1);
  Op2 = DAG.getNode(PromotionOpcode, dl, NVT, Op2);

  SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1, Op2);

  // Convert back to FP16 as an integer.
  return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res);
}
```

The type to promote to is apparently already known here, and just picking something else probably won't go well?
The easiest thing to do would be to take the default float promoted values, insert your own fp_extend from f32 to f64, and then insert your own truncate to the legal f32 for the result.
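A minimal sketch of that suggestion, reusing the helper names from the snippet quoted above (my reading of it, with a hypothetical function name, not the committed patch):

```cpp
SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAViaF64(SDNode *N) {
  EVT OVT = N->getValueType(0);                               // f16
  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT); // typically f32
  SDLoc dl(N);

  // Take the default float-promoted (f32) operands, then extend them to f64.
  auto ToF64 = [&](SDValue Op) {
    Op = DAG.getNode(GetPromotionOpcode(OVT, NVT), dl, NVT,
                     GetSoftPromotedHalf(Op));
    return DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op);
  };
  SDValue Op0 = ToF64(N->getOperand(0));
  SDValue Op1 = ToF64(N->getOperand(1));
  SDValue Op2 = ToF64(N->getOperand(2));

  // Do the FMA in f64, where the f16 result can survive a final rounding.
  SDValue Res = DAG.getNode(N->getOpcode(), dl, MVT::f64, Op0, Op1, Op2);

  // Truncate back: f64 -> f32 -> f16-as-i16. Note that stepping through f32
  // here can itself double-round on some targets, which is the
  // f64->f32->f16 caveat that comes up later in this thread.
  Res = DAG.getNode(ISD::FP_ROUND, dl, NVT, Res,
                    DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
  return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res);
}
```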
This should be implied by the triple. Nothing is stopping you from just adding these functions to RuntimeLibcallsInfo today |
I made the suggested changes.
For SystemZ, we're not using the common soft-promote path for f16.

That's where I thought we should end up ...
What about bf16? Can you express this in terms of the fundamental properties of the fltSemantics?
I'm not sure; based on #131531, the bf16 variant should be a libcall unconditionally.
Libcall is a mechanical property; it doesn't change the type or rounding issues. The mixing of the libcall emission into type legalization for the soft-float case is an unfortunate DAG-ism.
Huh, doesn't picking the bf16 fma libcall do the right thing? Or are you saying that currently it would pick the f32 libcall (with the associated promotion/rounding)?

In any case even promoting to f64 will not give accurate results for bf16, so I'm not sure what to do?
There is currently no listing or knowledge of bf16 libcalls in LLVM. I wouldn't be surprised if the promotion of bf16 on targets without legal float, or that need soft float, is untested.
Well then I'm not sure what your suggestion is here? I don't see how we can implement anything valid for bf16?
> In any case even promoting to f64 will not give accurate results for bf16

That surprises me, really? If we don't already have tests relying on it, I'd probably return poison and emitError.
Those don't know about f16 libcalls, so they probably error if reached.
Is that a problem? Should they know about the f16 libcalls? Adding an extra argument to the get* libcall helpers?
Yes. I want tablegen to start generating all the get* libcall helper functions to assist with this.
Sure, is there anything I should do here now, though?
The test failures are kind of annoying. The failing check can be updated, but it will still fail if there are targets that do f64->f32->f16 rather than f64->f16. Maybe replace BAD-FMA with SKIP-FMA in these cases?
Don't add tests to Generic. There's not really such a thing as a "generic" codegen test; it's just a source of build pain. There's always a target dependency. In this case the prior art is the sequence of %if run lines, so I guess keep going with that.
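For context, the %if pattern being referred to looks roughly like this (a sketch with hypothetical RUN lines, not a quote of a specific test):

```llvm
; Each target is only exercised when it was actually built into llc:
; RUN: %if aarch64-registered-target %{ llc -mtriple=aarch64 < %s | FileCheck %s %}
; RUN: %if systemz-registered-target %{ llc -mtriple=s390x-linux-gnu < %s | FileCheck %s %}
; RUN: %if riscv-registered-target %{ llc -mtriple=riscv64 < %s | FileCheck %s %}

declare half @llvm.fma.f16(half, half, half)

define half @fma_f16(half %a, half %b, half %c) {
  %r = call half @llvm.fma.f16(half %a, half %b, half %c)
  ret half %r
}
```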
I don't want to add to those tests, but they fail because on some platforms the output is now different. So the output needs to be updated, but the standard update script doesn't handle these tests.
I was not aware there was a requirement for all targets to use SoftPromoteHalf. What specifically are the problems with PromoteFloat?
SystemZ makes f16 a legal type, so the PromoteFloat vs SoftPromoteHalf distinction is not relevant in that case: these are type legalization strategies, not op legalization strategies. (SystemZ still does need to use SoftPromoteHalf in some configurations.)

To add some additional confusion here, there is the separate "Promote" op legalization action (handled by LegalizeDAG). The FMA case here is handled by llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp, lines 5799 to 5807 (at commit 4190d57).
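Paraphrasing what that referenced PromoteNode code does for FMA (a from-memory sketch, not the verbatim lines): each operand is extended to the promoted type and the result is rounded back, which is exactly the pattern that double-rounds when the promoted type is f32:

```cpp
// Inside SelectionDAGLegalize::PromoteNode (sketch): OVT is the original
// type (f16 here), NVT the type the operation was promoted to.
case ISD::FMA: {
  SDValue Op0 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
  SDValue Op1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1));
  SDValue Op2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(2));
  SDValue Fma = DAG.getNode(ISD::FMA, dl, NVT, Op0, Op1, Op2);
  Results.push_back(DAG.getNode(ISD::FP_ROUND, dl, OVT, Fma,
                                DAG.getIntPtrConstant(0, dl)));
  break;
}
```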
I see, thanks!

Right. We didn't notice that because on SystemZ that path has essentially no users.

I think that's what @folkertdev mentions above: that we can fix fma on SystemZ via the "Promote" op legalization action.
Would this fail if the target does not have a legal f64? I think that is the RISCV error that CI runs into. What is the right thing to do in that case?
Yes, it looks like for RISCV, if we have zfhmin without d, we have the wonderful combination of legal f16, illegal f64, and no native f16 FMA. I'm not sure either what to do in that case. cc @topperc

Probably we need to emit an fmaf16 libcall in that case, but I don't think the infrastructure for that exists yet. Maybe for now just move this back into the SystemZ backend and open an issue to resolve the remaining RISCV issue separately?
This change is not just for SystemZ, but also AArch64, NVPTX and x86_64. So this bit of config has to be replicated quite a number of times.
I see. I guess in that case you could explicitly set the promoted type to f32 in the RISCV backend instead if +d is not available...
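If I read that right, it would be something like this in RISCVISelLowering.cpp (a sketch; the exact predicate spelling is an assumption):

```cpp
// Without the D extension, f64 is illegal, so opt back into the old
// f16 -> f32 promotion for FMA rather than the new f64 default.
if (!Subtarget.hasStdExtD())
  setOperationPromotedToType(ISD::FMA, MVT::f16, MVT::f32);
```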
(force-pushed from 438f8ba to 133aba5)
I think the Windows test failure is spurious; if so, I think this is ready.
Should anyone else review this? (It's unclear to me where the initiative lies.)
Heads up: this is causing linking errors in a downstream (IREE) in CPU code generation with no native support for f16.
OK, I understand the failure now, but I also have thoughts about this PR. Our immediate issue is that we carry our own bitcode library for builtin functions like f16 conversions, and we are missing the f16 <-> f64 conversions this now requires.

However, going back to what this PR is doing: as a downstream, I wonder if we really should be promoting f16 to f64. I get that it's the technically correct thing to do, for the reason that you say. But it's also going to be surprising: I wasn't expecting f64 to be involved in f16 arithmetic until I read the rationale in this PR and the linked issue. And some users are going to prefer the slightly incorrect f32 arithmetic as the better trade-off for their application domain; not every user cares about the last bit of mantissa enough to bother with f64.

I can see that the code checks the subtarget for f64 support, which is good, but my experience shows that this doesn't catch all cases, as besides CPU target support there is also the question of the existence of library symbols in downstreams. Moreover, the mere existence of f64 support doesn't imply for all users that it's the trade-off of performance vs correctness that they want. I feel that there should at least be an opt-out here.
The compiler can't respect a random DIY ABI. If you're pretending to be compatible with the triple, you have to provide the full ABI.
Wrong is wrong. The legalization has the option of using the afn flag to keep promoting to f32, but that cannot be mandatory.
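For reference, that opt-in would hang off the afn fast-math flag on the call; roughly (a hypothetical IR sketch):

```llvm
declare half @llvm.fma.f16(half, half, half)

; afn ("approximate functions") permits an approximate lowering, so the
; legalizer could keep the cheaper f32 promotion for flagged calls only.
define half @fma_f16_approx(half %a, half %b, half %c) {
  %r = call afn half @llvm.fma.f16(half %a, half %b, half %c)
  ret half %r
}
```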
Point taken about the ABI argument. I think though that my other point, about the trade-off of correctness vs the various downsides of involving f64 in f16 arithmetic, still stands. I don't know a simple solution though. The subtarget would need to have finer controls than the current boolean yes/no f64 support.
It's a bit of a moot point in this case. We don't have a compiler implementation of FMA to fall back on, so the alternative is an error.

There shouldn't be a mandatory control to degrade correctness. If you want to force the behavior, you can do your own cast to float. If you just need the optimization hint, this could use the fast-math flag. How off is this result? Is there another fixup sequence that could be applied in float?
Good idea to deal with this on our side by promoting f16 to f32 on targets where f16 isn't natively supported. We should just do that. Thanks. Also a good idea in principle to condition that on some kind of fast-math flag, but as you know LLVM fast-math flags are their own can of worms that I'm trying not to reopen just before the holidays :-) |
Merged: fix llvm.fma.f16 double rounding issue when there is no native support (llvm#171904), fixes llvm#98389.
FWIW, Clang also does this kind of promotion, controllable by -ffloat16-excess-precision.
Integrate llvm/llvm-project@1c023cb

This folds Lukas' commits from #22921. Thanks!

Additional local change: adding `f16<->f64` conversions and `f64` `fma` to `libdevice`, as these are now required by lowerings of `f16` multiply-add to targets without native `f16` support (typically CPU targets).

* The symptom that this is fixing is unresolved symbol errors at linking.
* The regressing upstream PR is llvm/llvm-project#171904.
* Rationale given there: it's necessary to go to `f64` to implement a perfectly accurate `f16` multiply-add without observable double-rounding.
* We don't need to worry too much about performance here: by definition, this code only runs when we use a data type that is not natively supported in hardware, so users who care about performance should not be on such a path anyway. This preexisting rationale is why the `libdevice` implementation of `f16` conversions favors simplicity over performance.

Existing local reverts carried forward:

* Local revert of llvm/llvm-project#169614 due to #22649.
* Also iree-org/llvm-project@dea9ec8, which had landed on top.

Signed-off-by: Benoit Jacob <[email protected]>
Signed-off-by: Lukas Sommer <[email protected]>
Co-authored-by: Lukas Sommer <[email protected]>