diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 383a025a4d916..72814a6890075 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -3507,6 +3507,7 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) { SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0)); SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1)); SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2)); + SDNodeFlags Flags = N->getFlags(); SDLoc dl(N); // Promote to the larger FP type. @@ -3515,9 +3516,28 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) { Op1 = DAG.getNode(PromotionOpcode, dl, NVT, Op1); Op2 = DAG.getNode(PromotionOpcode, dl, NVT, Op2); - SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1, Op2); + SDValue Res; + if (OVT == MVT::f16) { + // If f16 fma is not natively supported, the value must be promoted to an + // f64 (and not to f32!) to prevent double rounding issues. + SDValue A64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op0, Flags); + SDValue B64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op1, Flags); + SDValue C64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op2, Flags); + + // Prefer a wide FMA node if available; otherwise expand to mul+add. + SDValue WideRes; + if (TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), MVT::f64)) { + WideRes = DAG.getNode(ISD::FMA, dl, MVT::f64, A64, B64, C64, Flags); + } else { + SDValue Mul = DAG.getNode(ISD::FMUL, dl, MVT::f64, A64, B64, Flags); + WideRes = DAG.getNode(ISD::FADD, dl, MVT::f64, Mul, C64, Flags); + } - // Convert back to FP16 as an integer. + return DAG.getNode(GetPromotionOpcode(MVT::f64, OVT), dl, MVT::i16, + WideRes); + } + + Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1, Op2, Flags); return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res); } diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 1d674b283db15..d62bf8a4ad74d 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -842,6 +842,11 @@ void TargetLoweringBase::initActions() { } } + // If f16 fma is not natively supported, the value must be promoted to an f64 + // (and not to f32!) to prevent double rounding issues. + AddPromotedToType(ISD::FMA, MVT::f16, MVT::f64); + AddPromotedToType(ISD::STRICT_FMA, MVT::f16, MVT::f64); + // Set default actions for various operations. for (MVT VT : MVT::all_valuetypes()) { // Default all indexed load / store to expand. diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index be53f51afe79f..c32056e396dcc 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -572,6 +572,15 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, XLenVT, Custom); } + if (!Subtarget.hasStdExtD()) { + // FIXME: handle f16 fma when f64 is not legal. Using an f32 fma + // instruction runs into double rounding issues, so this is wrong. + // Normally we'd use an f64 fma, but without the D extension the f64 type + // is not legal. This should probably be a libcall. + AddPromotedToType(ISD::FMA, MVT::f16, MVT::f32); + AddPromotedToType(ISD::STRICT_FMA, MVT::f16, MVT::f32); + } + setOperationAction(ISD::BITCAST, MVT::i16, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Legal); diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll index 085170c7ba381..f6d701b518699 100644 --- a/llvm/test/CodeGen/AArch64/f16-instructions.ll +++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll @@ -1378,11 +1378,11 @@ define half @test_log2(half %a) #0 { define half @test_fma(half %a, half %b, half %c) #0 { ; CHECK-CVT-SD-LABEL: test_fma: ; CHECK-CVT-SD: // %bb.0: -; CHECK-CVT-SD-NEXT: fcvt s2, h2 -; CHECK-CVT-SD-NEXT: fcvt s1, h1 -; CHECK-CVT-SD-NEXT: fcvt s0, h0 -; CHECK-CVT-SD-NEXT: fmadd s0, s0, s1, s2 -; CHECK-CVT-SD-NEXT: fcvt h0, s0 +; CHECK-CVT-SD-NEXT: fcvt d2, h2 +; CHECK-CVT-SD-NEXT: fcvt d1, h1 +; CHECK-CVT-SD-NEXT: fcvt d0, h0 +; CHECK-CVT-SD-NEXT: fmadd d0, d0, d1, d2 +; CHECK-CVT-SD-NEXT: fcvt h0, d0 ; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fma: diff --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll index 12b6562b5cf0c..16c835b59bd3f 100644 --- a/llvm/test/CodeGen/AArch64/fmla.ll +++ b/llvm/test/CodeGen/AArch64/fmla.ll @@ -27,11 +27,11 @@ entry: define half @fma_f16(half %a, half %b, half %c) { ; CHECK-SD-NOFP16-LABEL: fma_f16: ; CHECK-SD-NOFP16: // %bb.0: // %entry -; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 -; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 -; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 -; CHECK-SD-NOFP16-NEXT: fmadd s0, s0, s1, s2 -; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: fcvt d2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt d1, h1 +; CHECK-SD-NOFP16-NEXT: fcvt d0, h0 +; CHECK-SD-NOFP16-NEXT: fmadd d0, d0, d1, d2 +; CHECK-SD-NOFP16-NEXT: fcvt h0, d0 ; CHECK-SD-NOFP16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fma_f16: @@ -178,69 +178,69 @@ define <7 x half> @fma_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) { ; CHECK-SD-NOFP16-NEXT: mov h3, v2.h[1] ; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[1] ; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[1] -; CHECK-SD-NOFP16-NEXT: fcvt s6, h2 -; CHECK-SD-NOFP16-NEXT: fcvt s7, h1 -; CHECK-SD-NOFP16-NEXT: fcvt s16, h0 +; CHECK-SD-NOFP16-NEXT: fcvt d6, h2 +; CHECK-SD-NOFP16-NEXT: fcvt d7, h1 +; CHECK-SD-NOFP16-NEXT: fcvt d16, h0 ; CHECK-SD-NOFP16-NEXT: mov h17, v2.h[2] ; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[2] ; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[2] -; CHECK-SD-NOFP16-NEXT: fcvt s3, h3 -; CHECK-SD-NOFP16-NEXT: fcvt s4, h4 -; CHECK-SD-NOFP16-NEXT: fcvt s5, h5 -; CHECK-SD-NOFP16-NEXT: fmadd s6, s16, s7, s6 -; CHECK-SD-NOFP16-NEXT: fcvt s7, h17 -; CHECK-SD-NOFP16-NEXT: fcvt s16, h18 -; CHECK-SD-NOFP16-NEXT: fcvt s17, h19 +; CHECK-SD-NOFP16-NEXT: fcvt d3, h3 +; CHECK-SD-NOFP16-NEXT: fcvt d4, h4 +; CHECK-SD-NOFP16-NEXT: fcvt d5, h5 +; CHECK-SD-NOFP16-NEXT: fmadd d6, d16, d7, d6 +; CHECK-SD-NOFP16-NEXT: fcvt d7, h17 +; CHECK-SD-NOFP16-NEXT: fcvt d16, h18 +; CHECK-SD-NOFP16-NEXT: fcvt d17, h19 ; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[3] ; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[3] -; CHECK-SD-NOFP16-NEXT: fmadd s4, s5, s4, s3 +; CHECK-SD-NOFP16-NEXT: fmadd d4, d5, d4, d3 ; CHECK-SD-NOFP16-NEXT: mov h5, v2.h[3] -; CHECK-SD-NOFP16-NEXT: fcvt h3, s6 -; CHECK-SD-NOFP16-NEXT: fmadd s6, s17, s16, s7 +; CHECK-SD-NOFP16-NEXT: fcvt h3, d6 +; CHECK-SD-NOFP16-NEXT: fmadd d6, d17, d16, d7 ; CHECK-SD-NOFP16-NEXT: mov h17, v2.h[4] -; CHECK-SD-NOFP16-NEXT: fcvt s7, h18 -; CHECK-SD-NOFP16-NEXT: fcvt s16, h19 +; CHECK-SD-NOFP16-NEXT: fcvt d7, h18 +; CHECK-SD-NOFP16-NEXT: fcvt d16, h19 ; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[4] -; CHECK-SD-NOFP16-NEXT: fcvt h4, s4 -; CHECK-SD-NOFP16-NEXT: fcvt s5, h5 +; CHECK-SD-NOFP16-NEXT: fcvt h4, d4 +; CHECK-SD-NOFP16-NEXT: fcvt d5, h5 ; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[4] -; CHECK-SD-NOFP16-NEXT: fcvt h6, s6 -; CHECK-SD-NOFP16-NEXT: fcvt s17, h17 -; CHECK-SD-NOFP16-NEXT: fcvt s18, h18 +; CHECK-SD-NOFP16-NEXT: fcvt h6, d6 +; CHECK-SD-NOFP16-NEXT: fcvt d17, h17 +; CHECK-SD-NOFP16-NEXT: fcvt d18, h18 ; CHECK-SD-NOFP16-NEXT: mov v3.h[1], v4.h[0] ; CHECK-SD-NOFP16-NEXT: mov h4, v2.h[5] -; CHECK-SD-NOFP16-NEXT: fmadd s5, s16, s7, s5 +; CHECK-SD-NOFP16-NEXT: fmadd d5, d16, d7, d5 ; CHECK-SD-NOFP16-NEXT: mov h7, v1.h[5] ; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[5] -; CHECK-SD-NOFP16-NEXT: fcvt s19, h19 +; CHECK-SD-NOFP16-NEXT: fcvt d19, h19 ; CHECK-SD-NOFP16-NEXT: mov v3.h[2], v6.h[0] ; CHECK-SD-NOFP16-NEXT: mov h6, v2.h[6] -; CHECK-SD-NOFP16-NEXT: fcvt s4, h4 -; CHECK-SD-NOFP16-NEXT: fcvt s7, h7 -; CHECK-SD-NOFP16-NEXT: fcvt s16, h16 -; CHECK-SD-NOFP16-NEXT: fcvt h5, s5 -; CHECK-SD-NOFP16-NEXT: fmadd s17, s19, s18, s17 +; CHECK-SD-NOFP16-NEXT: fcvt d4, h4 +; CHECK-SD-NOFP16-NEXT: fcvt d7, h7 +; CHECK-SD-NOFP16-NEXT: fcvt d16, h16 +; CHECK-SD-NOFP16-NEXT: fcvt h5, d5 +; CHECK-SD-NOFP16-NEXT: fmadd d17, d19, d18, d17 ; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[6] ; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[6] ; CHECK-SD-NOFP16-NEXT: mov h2, v2.h[7] ; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7] ; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7] -; CHECK-SD-NOFP16-NEXT: fmadd s4, s16, s7, s4 +; CHECK-SD-NOFP16-NEXT: fmadd d4, d16, d7, d4 ; CHECK-SD-NOFP16-NEXT: mov v3.h[3], v5.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt s5, h6 -; CHECK-SD-NOFP16-NEXT: fcvt s6, h18 -; CHECK-SD-NOFP16-NEXT: fcvt s7, h19 -; CHECK-SD-NOFP16-NEXT: fcvt h16, s17 -; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 -; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 -; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 -; CHECK-SD-NOFP16-NEXT: fcvt h4, s4 -; CHECK-SD-NOFP16-NEXT: fmadd s5, s7, s6, s5 +; CHECK-SD-NOFP16-NEXT: fcvt d5, h6 +; CHECK-SD-NOFP16-NEXT: fcvt d6, h18 +; CHECK-SD-NOFP16-NEXT: fcvt d7, h19 +; CHECK-SD-NOFP16-NEXT: fcvt h16, d17 +; CHECK-SD-NOFP16-NEXT: fcvt d2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt d1, h1 +; CHECK-SD-NOFP16-NEXT: fcvt d0, h0 +; CHECK-SD-NOFP16-NEXT: fcvt h4, d4 +; CHECK-SD-NOFP16-NEXT: fmadd d5, d7, d6, d5 ; CHECK-SD-NOFP16-NEXT: mov v3.h[4], v16.h[0] -; CHECK-SD-NOFP16-NEXT: fmadd s0, s0, s1, s2 +; CHECK-SD-NOFP16-NEXT: fmadd d0, d0, d1, d2 ; CHECK-SD-NOFP16-NEXT: mov v3.h[5], v4.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h4, s5 -; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: fcvt h4, d5 +; CHECK-SD-NOFP16-NEXT: fcvt h0, d0 ; CHECK-SD-NOFP16-NEXT: mov v3.h[6], v4.h[0] ; CHECK-SD-NOFP16-NEXT: mov v3.h[7], v0.h[0] ; CHECK-SD-NOFP16-NEXT: mov v0.16b, v3.16b @@ -301,34 +301,34 @@ define <4 x half> @fma_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) { ; CHECK-SD-NOFP16-NEXT: mov h3, v2.h[1] ; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[1] ; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[1] -; CHECK-SD-NOFP16-NEXT: fcvt s6, h2 -; CHECK-SD-NOFP16-NEXT: fcvt s7, h1 -; CHECK-SD-NOFP16-NEXT: fcvt s16, h0 +; CHECK-SD-NOFP16-NEXT: fcvt d6, h2 +; CHECK-SD-NOFP16-NEXT: fcvt d7, h1 +; CHECK-SD-NOFP16-NEXT: fcvt d16, h0 ; CHECK-SD-NOFP16-NEXT: mov h17, v2.h[2] ; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[2] ; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[2] ; CHECK-SD-NOFP16-NEXT: mov h2, v2.h[3] ; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[3] -; CHECK-SD-NOFP16-NEXT: fcvt s3, h3 -; CHECK-SD-NOFP16-NEXT: fcvt s4, h4 -; CHECK-SD-NOFP16-NEXT: fcvt s5, h5 -; CHECK-SD-NOFP16-NEXT: fmadd s6, s16, s7, s6 +; CHECK-SD-NOFP16-NEXT: fcvt d3, h3 +; CHECK-SD-NOFP16-NEXT: fcvt d4, h4 +; CHECK-SD-NOFP16-NEXT: fcvt d5, h5 +; CHECK-SD-NOFP16-NEXT: fmadd d6, d16, d7, d6 ; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[3] -; CHECK-SD-NOFP16-NEXT: fcvt s7, h19 -; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 -; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 -; CHECK-SD-NOFP16-NEXT: fmadd s3, s5, s4, s3 -; CHECK-SD-NOFP16-NEXT: fcvt s4, h17 -; CHECK-SD-NOFP16-NEXT: fcvt s5, h18 -; CHECK-SD-NOFP16-NEXT: fcvt h0, s6 -; CHECK-SD-NOFP16-NEXT: fmadd s4, s7, s5, s4 -; CHECK-SD-NOFP16-NEXT: fcvt h3, s3 -; CHECK-SD-NOFP16-NEXT: fcvt s5, h16 +; CHECK-SD-NOFP16-NEXT: fcvt d7, h19 +; CHECK-SD-NOFP16-NEXT: fcvt d2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt d1, h1 +; CHECK-SD-NOFP16-NEXT: fmadd d3, d5, d4, d3 +; CHECK-SD-NOFP16-NEXT: fcvt d4, h17 +; CHECK-SD-NOFP16-NEXT: fcvt d5, h18 +; CHECK-SD-NOFP16-NEXT: fcvt h0, d6 +; CHECK-SD-NOFP16-NEXT: fmadd d4, d7, d5, d4 +; CHECK-SD-NOFP16-NEXT: fcvt h3, d3 +; CHECK-SD-NOFP16-NEXT: fcvt d5, h16 ; CHECK-SD-NOFP16-NEXT: mov v0.h[1], v3.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h3, s4 -; CHECK-SD-NOFP16-NEXT: fmadd s1, s5, s1, s2 +; CHECK-SD-NOFP16-NEXT: fcvt h3, d4 +; CHECK-SD-NOFP16-NEXT: fmadd d1, d5, d1, d2 ; CHECK-SD-NOFP16-NEXT: mov v0.h[2], v3.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h1, s1 +; CHECK-SD-NOFP16-NEXT: fcvt h1, d1 ; CHECK-SD-NOFP16-NEXT: mov v0.h[3], v1.h[0] ; CHECK-SD-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NOFP16-NEXT: ret @@ -364,69 +364,69 @@ define <8 x half> @fma_v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) { ; CHECK-SD-NOFP16-NEXT: mov h3, v2.h[1] ; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[1] ; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[1] -; CHECK-SD-NOFP16-NEXT: fcvt s6, h2 -; CHECK-SD-NOFP16-NEXT: fcvt s7, h1 -; CHECK-SD-NOFP16-NEXT: fcvt s16, h0 +; CHECK-SD-NOFP16-NEXT: fcvt d6, h2 +; CHECK-SD-NOFP16-NEXT: fcvt d7, h1 +; CHECK-SD-NOFP16-NEXT: fcvt d16, h0 ; CHECK-SD-NOFP16-NEXT: mov h17, v2.h[2] ; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[2] ; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[2] -; CHECK-SD-NOFP16-NEXT: fcvt s3, h3 -; CHECK-SD-NOFP16-NEXT: fcvt s4, h4 -; CHECK-SD-NOFP16-NEXT: fcvt s5, h5 -; CHECK-SD-NOFP16-NEXT: fmadd s6, s16, s7, s6 -; CHECK-SD-NOFP16-NEXT: fcvt s7, h17 -; CHECK-SD-NOFP16-NEXT: fcvt s16, h18 -; CHECK-SD-NOFP16-NEXT: fcvt s17, h19 +; CHECK-SD-NOFP16-NEXT: fcvt d3, h3 +; CHECK-SD-NOFP16-NEXT: fcvt d4, h4 +; CHECK-SD-NOFP16-NEXT: fcvt d5, h5 +; CHECK-SD-NOFP16-NEXT: fmadd d6, d16, d7, d6 +; CHECK-SD-NOFP16-NEXT: fcvt d7, h17 +; CHECK-SD-NOFP16-NEXT: fcvt d16, h18 +; CHECK-SD-NOFP16-NEXT: fcvt d17, h19 ; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[3] ; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[3] -; CHECK-SD-NOFP16-NEXT: fmadd s4, s5, s4, s3 +; CHECK-SD-NOFP16-NEXT: fmadd d4, d5, d4, d3 ; CHECK-SD-NOFP16-NEXT: mov h5, v2.h[3] -; CHECK-SD-NOFP16-NEXT: fcvt h3, s6 -; CHECK-SD-NOFP16-NEXT: fmadd s6, s17, s16, s7 +; CHECK-SD-NOFP16-NEXT: fcvt h3, d6 +; CHECK-SD-NOFP16-NEXT: fmadd d6, d17, d16, d7 ; CHECK-SD-NOFP16-NEXT: mov h17, v2.h[4] -; CHECK-SD-NOFP16-NEXT: fcvt s7, h18 -; CHECK-SD-NOFP16-NEXT: fcvt s16, h19 +; CHECK-SD-NOFP16-NEXT: fcvt d7, h18 +; CHECK-SD-NOFP16-NEXT: fcvt d16, h19 ; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[4] -; CHECK-SD-NOFP16-NEXT: fcvt h4, s4 -; CHECK-SD-NOFP16-NEXT: fcvt s5, h5 +; CHECK-SD-NOFP16-NEXT: fcvt h4, d4 +; CHECK-SD-NOFP16-NEXT: fcvt d5, h5 ; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[4] -; CHECK-SD-NOFP16-NEXT: fcvt h6, s6 -; CHECK-SD-NOFP16-NEXT: fcvt s17, h17 -; CHECK-SD-NOFP16-NEXT: fcvt s18, h18 +; CHECK-SD-NOFP16-NEXT: fcvt h6, d6 +; CHECK-SD-NOFP16-NEXT: fcvt d17, h17 +; CHECK-SD-NOFP16-NEXT: fcvt d18, h18 ; CHECK-SD-NOFP16-NEXT: mov v3.h[1], v4.h[0] ; CHECK-SD-NOFP16-NEXT: mov h4, v2.h[5] -; CHECK-SD-NOFP16-NEXT: fmadd s5, s16, s7, s5 +; CHECK-SD-NOFP16-NEXT: fmadd d5, d16, d7, d5 ; CHECK-SD-NOFP16-NEXT: mov h7, v1.h[5] ; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[5] -; CHECK-SD-NOFP16-NEXT: fcvt s19, h19 +; CHECK-SD-NOFP16-NEXT: fcvt d19, h19 ; CHECK-SD-NOFP16-NEXT: mov v3.h[2], v6.h[0] ; CHECK-SD-NOFP16-NEXT: mov h6, v2.h[6] -; CHECK-SD-NOFP16-NEXT: fcvt s4, h4 -; CHECK-SD-NOFP16-NEXT: fcvt s7, h7 -; CHECK-SD-NOFP16-NEXT: fcvt s16, h16 -; CHECK-SD-NOFP16-NEXT: fcvt h5, s5 -; CHECK-SD-NOFP16-NEXT: fmadd s17, s19, s18, s17 +; CHECK-SD-NOFP16-NEXT: fcvt d4, h4 +; CHECK-SD-NOFP16-NEXT: fcvt d7, h7 +; CHECK-SD-NOFP16-NEXT: fcvt d16, h16 +; CHECK-SD-NOFP16-NEXT: fcvt h5, d5 +; CHECK-SD-NOFP16-NEXT: fmadd d17, d19, d18, d17 ; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[6] ; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[6] ; CHECK-SD-NOFP16-NEXT: mov h2, v2.h[7] ; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7] ; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7] -; CHECK-SD-NOFP16-NEXT: fmadd s4, s16, s7, s4 +; CHECK-SD-NOFP16-NEXT: fmadd d4, d16, d7, d4 ; CHECK-SD-NOFP16-NEXT: mov v3.h[3], v5.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt s5, h6 -; CHECK-SD-NOFP16-NEXT: fcvt s6, h18 -; CHECK-SD-NOFP16-NEXT: fcvt s7, h19 -; CHECK-SD-NOFP16-NEXT: fcvt h16, s17 -; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 -; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 -; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 -; CHECK-SD-NOFP16-NEXT: fcvt h4, s4 -; CHECK-SD-NOFP16-NEXT: fmadd s5, s7, s6, s5 +; CHECK-SD-NOFP16-NEXT: fcvt d5, h6 +; CHECK-SD-NOFP16-NEXT: fcvt d6, h18 +; CHECK-SD-NOFP16-NEXT: fcvt d7, h19 +; CHECK-SD-NOFP16-NEXT: fcvt h16, d17 +; CHECK-SD-NOFP16-NEXT: fcvt d2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt d1, h1 +; CHECK-SD-NOFP16-NEXT: fcvt d0, h0 +; CHECK-SD-NOFP16-NEXT: fcvt h4, d4 +; CHECK-SD-NOFP16-NEXT: fmadd d5, d7, d6, d5 ; CHECK-SD-NOFP16-NEXT: mov v3.h[4], v16.h[0] -; CHECK-SD-NOFP16-NEXT: fmadd s0, s0, s1, s2 +; CHECK-SD-NOFP16-NEXT: fmadd d0, d0, d1, d2 ; CHECK-SD-NOFP16-NEXT: mov v3.h[5], v4.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h4, s5 -; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: fcvt h4, d5 +; CHECK-SD-NOFP16-NEXT: fcvt h0, d0 ; CHECK-SD-NOFP16-NEXT: mov v3.h[6], v4.h[0] ; CHECK-SD-NOFP16-NEXT: mov v3.h[7], v0.h[0] ; CHECK-SD-NOFP16-NEXT: mov v0.16b, v3.16b @@ -468,136 +468,136 @@ define <16 x half> @fma_v16f16(<16 x half> %a, <16 x half> %b, <16 x half> %c) { ; CHECK-SD-NOFP16-NEXT: mov h6, v4.h[1] ; CHECK-SD-NOFP16-NEXT: mov h7, v2.h[1] ; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[1] -; CHECK-SD-NOFP16-NEXT: fcvt s17, h4 -; CHECK-SD-NOFP16-NEXT: fcvt s18, h2 -; CHECK-SD-NOFP16-NEXT: fcvt s19, h0 +; CHECK-SD-NOFP16-NEXT: fcvt d17, h4 +; CHECK-SD-NOFP16-NEXT: fcvt d18, h2 +; CHECK-SD-NOFP16-NEXT: fcvt d19, h0 ; CHECK-SD-NOFP16-NEXT: mov h20, v4.h[2] ; CHECK-SD-NOFP16-NEXT: mov h21, v2.h[2] ; CHECK-SD-NOFP16-NEXT: mov h22, v0.h[2] ; CHECK-SD-NOFP16-NEXT: mov h23, v4.h[3] ; CHECK-SD-NOFP16-NEXT: mov h24, v2.h[3] ; CHECK-SD-NOFP16-NEXT: mov h25, v0.h[3] -; CHECK-SD-NOFP16-NEXT: fcvt s6, h6 -; CHECK-SD-NOFP16-NEXT: fcvt s7, h7 -; CHECK-SD-NOFP16-NEXT: fcvt s16, h16 -; CHECK-SD-NOFP16-NEXT: fmadd s17, s19, s18, s17 +; CHECK-SD-NOFP16-NEXT: fcvt d6, h6 +; CHECK-SD-NOFP16-NEXT: fcvt d7, h7 +; CHECK-SD-NOFP16-NEXT: fcvt d16, h16 +; CHECK-SD-NOFP16-NEXT: fmadd d17, d19, d18, d17 ; CHECK-SD-NOFP16-NEXT: mov h26, v1.h[1] -; CHECK-SD-NOFP16-NEXT: fcvt s27, h5 -; CHECK-SD-NOFP16-NEXT: fcvt s18, h20 -; CHECK-SD-NOFP16-NEXT: fcvt s19, h21 -; CHECK-SD-NOFP16-NEXT: fcvt s20, h22 -; CHECK-SD-NOFP16-NEXT: fcvt s21, h23 -; CHECK-SD-NOFP16-NEXT: fcvt s22, h24 -; CHECK-SD-NOFP16-NEXT: fcvt s23, h25 -; CHECK-SD-NOFP16-NEXT: fmadd s7, s16, s7, s6 +; CHECK-SD-NOFP16-NEXT: fcvt d27, h5 +; CHECK-SD-NOFP16-NEXT: fcvt d18, h20 +; CHECK-SD-NOFP16-NEXT: fcvt d19, h21 +; CHECK-SD-NOFP16-NEXT: fcvt d20, h22 +; CHECK-SD-NOFP16-NEXT: fcvt d21, h23 +; CHECK-SD-NOFP16-NEXT: fcvt d22, h24 +; CHECK-SD-NOFP16-NEXT: fcvt d23, h25 +; CHECK-SD-NOFP16-NEXT: fmadd d7, d16, d7, d6 ; CHECK-SD-NOFP16-NEXT: mov h24, v5.h[1] ; CHECK-SD-NOFP16-NEXT: mov h25, v3.h[1] -; CHECK-SD-NOFP16-NEXT: fcvt h6, s17 -; CHECK-SD-NOFP16-NEXT: fcvt s28, h3 -; CHECK-SD-NOFP16-NEXT: fcvt s29, h1 -; CHECK-SD-NOFP16-NEXT: fmadd s19, s20, s19, s18 -; CHECK-SD-NOFP16-NEXT: fcvt s26, h26 +; CHECK-SD-NOFP16-NEXT: fcvt h6, d17 +; CHECK-SD-NOFP16-NEXT: fcvt d28, h3 +; CHECK-SD-NOFP16-NEXT: fcvt d29, h1 +; CHECK-SD-NOFP16-NEXT: fmadd d19, d20, d19, d18 +; CHECK-SD-NOFP16-NEXT: fcvt d26, h26 ; CHECK-SD-NOFP16-NEXT: mov h16, v4.h[4] -; CHECK-SD-NOFP16-NEXT: fmadd s21, s23, s22, s21 +; CHECK-SD-NOFP16-NEXT: fmadd d21, d23, d22, d21 ; CHECK-SD-NOFP16-NEXT: mov h22, v3.h[2] ; CHECK-SD-NOFP16-NEXT: mov h23, v1.h[2] -; CHECK-SD-NOFP16-NEXT: fcvt h20, s7 -; CHECK-SD-NOFP16-NEXT: fcvt s24, h24 -; CHECK-SD-NOFP16-NEXT: fcvt s25, h25 +; CHECK-SD-NOFP16-NEXT: fcvt h20, d7 +; CHECK-SD-NOFP16-NEXT: fcvt d24, h24 +; CHECK-SD-NOFP16-NEXT: fcvt d25, h25 ; CHECK-SD-NOFP16-NEXT: mov h17, v2.h[4] ; CHECK-SD-NOFP16-NEXT: mov h18, v0.h[4] ; CHECK-SD-NOFP16-NEXT: mov h7, v4.h[5] -; CHECK-SD-NOFP16-NEXT: fcvt h19, s19 +; CHECK-SD-NOFP16-NEXT: fcvt h19, d19 ; CHECK-SD-NOFP16-NEXT: mov h30, v2.h[5] -; CHECK-SD-NOFP16-NEXT: fcvt s16, h16 -; CHECK-SD-NOFP16-NEXT: fcvt h21, s21 +; CHECK-SD-NOFP16-NEXT: fcvt d16, h16 +; CHECK-SD-NOFP16-NEXT: fcvt h21, d21 ; CHECK-SD-NOFP16-NEXT: mov h31, v1.h[4] -; CHECK-SD-NOFP16-NEXT: fmadd s24, s26, s25, s24 -; CHECK-SD-NOFP16-NEXT: fmadd s25, s29, s28, s27 +; CHECK-SD-NOFP16-NEXT: fmadd d24, d26, d25, d24 +; CHECK-SD-NOFP16-NEXT: fmadd d25, d29, d28, d27 ; CHECK-SD-NOFP16-NEXT: mov v6.h[1], v20.h[0] ; CHECK-SD-NOFP16-NEXT: mov h20, v5.h[2] ; CHECK-SD-NOFP16-NEXT: mov h26, v5.h[3] ; CHECK-SD-NOFP16-NEXT: mov h27, v3.h[3] ; CHECK-SD-NOFP16-NEXT: mov h28, v1.h[3] -; CHECK-SD-NOFP16-NEXT: fcvt s17, h17 -; CHECK-SD-NOFP16-NEXT: fcvt s18, h18 -; CHECK-SD-NOFP16-NEXT: fcvt s29, h7 -; CHECK-SD-NOFP16-NEXT: fcvt s30, h30 +; CHECK-SD-NOFP16-NEXT: fcvt d17, h17 +; CHECK-SD-NOFP16-NEXT: fcvt d18, h18 +; CHECK-SD-NOFP16-NEXT: fcvt d29, h7 +; CHECK-SD-NOFP16-NEXT: fcvt d30, h30 ; CHECK-SD-NOFP16-NEXT: mov v6.h[2], v19.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h24, s24 -; CHECK-SD-NOFP16-NEXT: fcvt h7, s25 -; CHECK-SD-NOFP16-NEXT: fcvt s19, h20 -; CHECK-SD-NOFP16-NEXT: fcvt s20, h22 -; CHECK-SD-NOFP16-NEXT: fcvt s22, h23 -; CHECK-SD-NOFP16-NEXT: fmadd s16, s18, s17, s16 +; CHECK-SD-NOFP16-NEXT: fcvt h24, d24 +; CHECK-SD-NOFP16-NEXT: fcvt h7, d25 +; CHECK-SD-NOFP16-NEXT: fcvt d19, h20 +; CHECK-SD-NOFP16-NEXT: fcvt d20, h22 +; CHECK-SD-NOFP16-NEXT: fcvt d22, h23 +; CHECK-SD-NOFP16-NEXT: fmadd d16, d18, d17, d16 ; CHECK-SD-NOFP16-NEXT: mov h23, v0.h[5] -; CHECK-SD-NOFP16-NEXT: fcvt s25, h26 -; CHECK-SD-NOFP16-NEXT: fcvt s26, h27 -; CHECK-SD-NOFP16-NEXT: fcvt s27, h28 +; CHECK-SD-NOFP16-NEXT: fcvt d25, h26 +; CHECK-SD-NOFP16-NEXT: fcvt d26, h27 +; CHECK-SD-NOFP16-NEXT: fcvt d27, h28 ; CHECK-SD-NOFP16-NEXT: mov h18, v4.h[6] ; CHECK-SD-NOFP16-NEXT: mov v6.h[3], v21.h[0] ; CHECK-SD-NOFP16-NEXT: mov v7.h[1], v24.h[0] ; CHECK-SD-NOFP16-NEXT: mov h24, v5.h[5] -; CHECK-SD-NOFP16-NEXT: fmadd s19, s22, s20, s19 +; CHECK-SD-NOFP16-NEXT: fmadd d19, d22, d20, d19 ; CHECK-SD-NOFP16-NEXT: mov h20, v5.h[4] ; CHECK-SD-NOFP16-NEXT: mov h22, v3.h[4] -; CHECK-SD-NOFP16-NEXT: fcvt s23, h23 +; CHECK-SD-NOFP16-NEXT: fcvt d23, h23 ; CHECK-SD-NOFP16-NEXT: mov h28, v0.h[6] -; CHECK-SD-NOFP16-NEXT: fcvt h16, s16 -; CHECK-SD-NOFP16-NEXT: fcvt s18, h18 +; CHECK-SD-NOFP16-NEXT: fcvt h16, d16 +; CHECK-SD-NOFP16-NEXT: fcvt d18, h18 ; CHECK-SD-NOFP16-NEXT: mov h4, v4.h[7] ; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7] -; CHECK-SD-NOFP16-NEXT: fcvt s20, h20 -; CHECK-SD-NOFP16-NEXT: fcvt s21, h22 -; CHECK-SD-NOFP16-NEXT: fcvt s22, h31 -; CHECK-SD-NOFP16-NEXT: fmadd s17, s23, s30, s29 -; CHECK-SD-NOFP16-NEXT: fmadd s23, s27, s26, s25 -; CHECK-SD-NOFP16-NEXT: fcvt h19, s19 +; CHECK-SD-NOFP16-NEXT: fcvt d20, h20 +; CHECK-SD-NOFP16-NEXT: fcvt d21, h22 +; CHECK-SD-NOFP16-NEXT: fcvt d22, h31 +; CHECK-SD-NOFP16-NEXT: fmadd d17, d23, d30, d29 +; CHECK-SD-NOFP16-NEXT: fmadd d23, d27, d26, d25 +; CHECK-SD-NOFP16-NEXT: fcvt h19, d19 ; CHECK-SD-NOFP16-NEXT: mov h25, v3.h[5] ; CHECK-SD-NOFP16-NEXT: mov h26, v1.h[5] ; CHECK-SD-NOFP16-NEXT: mov h27, v2.h[6] ; CHECK-SD-NOFP16-NEXT: mov h29, v1.h[6] ; CHECK-SD-NOFP16-NEXT: mov h2, v2.h[7] ; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7] -; CHECK-SD-NOFP16-NEXT: fmadd s20, s22, s21, s20 +; CHECK-SD-NOFP16-NEXT: fmadd d20, d22, d21, d20 ; CHECK-SD-NOFP16-NEXT: mov h21, v5.h[6] ; CHECK-SD-NOFP16-NEXT: mov h22, v3.h[6] ; CHECK-SD-NOFP16-NEXT: mov v7.h[2], v19.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h19, s23 -; CHECK-SD-NOFP16-NEXT: fcvt s23, h24 -; CHECK-SD-NOFP16-NEXT: fcvt s24, h25 -; CHECK-SD-NOFP16-NEXT: fcvt s25, h26 -; CHECK-SD-NOFP16-NEXT: fcvt s26, h27 -; CHECK-SD-NOFP16-NEXT: fcvt s27, h28 -; CHECK-SD-NOFP16-NEXT: fcvt s28, h29 +; CHECK-SD-NOFP16-NEXT: fcvt h19, d23 +; CHECK-SD-NOFP16-NEXT: fcvt d23, h24 +; CHECK-SD-NOFP16-NEXT: fcvt d24, h25 +; CHECK-SD-NOFP16-NEXT: fcvt d25, h26 +; CHECK-SD-NOFP16-NEXT: fcvt d26, h27 +; CHECK-SD-NOFP16-NEXT: fcvt d27, h28 +; CHECK-SD-NOFP16-NEXT: fcvt d28, h29 ; CHECK-SD-NOFP16-NEXT: mov h5, v5.h[7] -; CHECK-SD-NOFP16-NEXT: fcvt s21, h21 -; CHECK-SD-NOFP16-NEXT: fcvt s22, h22 +; CHECK-SD-NOFP16-NEXT: fcvt d21, h21 +; CHECK-SD-NOFP16-NEXT: fcvt d22, h22 ; CHECK-SD-NOFP16-NEXT: mov h3, v3.h[7] ; CHECK-SD-NOFP16-NEXT: mov v7.h[3], v19.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h19, s20 +; CHECK-SD-NOFP16-NEXT: fcvt h19, d20 ; CHECK-SD-NOFP16-NEXT: mov v6.h[4], v16.h[0] -; CHECK-SD-NOFP16-NEXT: fmadd s20, s25, s24, s23 -; CHECK-SD-NOFP16-NEXT: fcvt h16, s17 -; CHECK-SD-NOFP16-NEXT: fcvt s4, h4 -; CHECK-SD-NOFP16-NEXT: fmadd s18, s27, s26, s18 -; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 -; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 -; CHECK-SD-NOFP16-NEXT: fmadd s21, s28, s22, s21 -; CHECK-SD-NOFP16-NEXT: fcvt s5, h5 -; CHECK-SD-NOFP16-NEXT: fcvt s3, h3 +; CHECK-SD-NOFP16-NEXT: fmadd d20, d25, d24, d23 +; CHECK-SD-NOFP16-NEXT: fcvt h16, d17 +; CHECK-SD-NOFP16-NEXT: fcvt d4, h4 +; CHECK-SD-NOFP16-NEXT: fmadd d18, d27, d26, d18 +; CHECK-SD-NOFP16-NEXT: fcvt d2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt d0, h0 +; CHECK-SD-NOFP16-NEXT: fmadd d21, d28, d22, d21 +; CHECK-SD-NOFP16-NEXT: fcvt d5, h5 +; CHECK-SD-NOFP16-NEXT: fcvt d3, h3 ; CHECK-SD-NOFP16-NEXT: mov v7.h[4], v19.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 -; CHECK-SD-NOFP16-NEXT: fcvt h17, s20 +; CHECK-SD-NOFP16-NEXT: fcvt d1, h1 +; CHECK-SD-NOFP16-NEXT: fcvt h17, d20 ; CHECK-SD-NOFP16-NEXT: mov v6.h[5], v16.h[0] -; CHECK-SD-NOFP16-NEXT: fmadd s0, s0, s2, s4 -; CHECK-SD-NOFP16-NEXT: fcvt h2, s18 -; CHECK-SD-NOFP16-NEXT: fcvt h4, s21 -; CHECK-SD-NOFP16-NEXT: fmadd s1, s1, s3, s5 +; CHECK-SD-NOFP16-NEXT: fmadd d0, d0, d2, d4 +; CHECK-SD-NOFP16-NEXT: fcvt h2, d18 +; CHECK-SD-NOFP16-NEXT: fcvt h4, d21 +; CHECK-SD-NOFP16-NEXT: fmadd d1, d1, d3, d5 ; CHECK-SD-NOFP16-NEXT: mov v7.h[5], v17.h[0] ; CHECK-SD-NOFP16-NEXT: mov v6.h[6], v2.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 -; CHECK-SD-NOFP16-NEXT: fcvt h1, s1 +; CHECK-SD-NOFP16-NEXT: fcvt h0, d0 +; CHECK-SD-NOFP16-NEXT: fcvt h1, d1 ; CHECK-SD-NOFP16-NEXT: mov v7.h[6], v4.h[0] ; CHECK-SD-NOFP16-NEXT: mov v6.h[7], v0.h[0] ; CHECK-SD-NOFP16-NEXT: mov v7.h[7], v1.h[0] diff --git a/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll b/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll index 86029a7169abb..368fa0a0cfae9 100644 --- a/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll +++ b/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll @@ -170,11 +170,11 @@ define half @frem_f16(half %x, half %y) #0 { define half @fma_f16(half %x, half %y, half %z) #0 { ; CHECK-NOFP16-LABEL: fma_f16: ; CHECK-NOFP16: // %bb.0: -; CHECK-NOFP16-NEXT: fcvt s2, h2 -; CHECK-NOFP16-NEXT: fcvt s1, h1 -; CHECK-NOFP16-NEXT: fcvt s0, h0 -; CHECK-NOFP16-NEXT: fmadd s0, s0, s1, s2 -; CHECK-NOFP16-NEXT: fcvt h0, s0 +; CHECK-NOFP16-NEXT: fcvt d2, h2 +; CHECK-NOFP16-NEXT: fcvt d1, h1 +; CHECK-NOFP16-NEXT: fcvt d0, h0 +; CHECK-NOFP16-NEXT: fmadd d0, d0, d1, d2 +; CHECK-NOFP16-NEXT: fcvt h0, d0 ; CHECK-NOFP16-NEXT: ret ; ; CHECK-FP16-LABEL: fma_f16: @@ -1382,3 +1382,5 @@ declare i1 @llvm.experimental.constrained.fcmp.f16(half, half, metadata, metadat declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK-GI: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll index 20c06f0a1aff5..2f708cbda1f2b 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll @@ -1043,38 +1043,38 @@ define <2 x half> @fma_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3) ; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] ; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #4] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #30] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #2] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #28] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] ; NONEON-NOSVE-NEXT: ldr h2, [sp] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #26] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #24] ; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: add sp, sp, #32 @@ -1103,38 +1103,38 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) ; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] ; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #4] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #30] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #2] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #28] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] ; NONEON-NOSVE-NEXT: ldr h2, [sp] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #26] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #24] ; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: add sp, sp, #32 @@ -1163,74 +1163,74 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) ; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] ; NONEON-NOSVE-NEXT: ldr h1, [sp, #30] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #14] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #28] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #12] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #62] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #26] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #10] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #60] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #24] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #8] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #58] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #22] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #6] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #56] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #20] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #4] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #54] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #18] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #2] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #52] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #16] ; NONEON-NOSVE-NEXT: ldr h2, [sp] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #50] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #48] ; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: add sp, sp, #64 @@ -1264,146 +1264,146 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) { ; NONEON-NOSVE-NEXT: stp q1, q5, [sp, #32] ; NONEON-NOSVE-NEXT: ldr h1, [sp, #78] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #62] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #76] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #60] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #126] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #92] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #74] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #58] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #124] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #90] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #72] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #56] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #122] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #88] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #70] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #54] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #120] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #86] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #68] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #52] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #118] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #84] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #66] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #50] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #116] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #82] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #64] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #48] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #114] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #80] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #30] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #14] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #112] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #28] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #12] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #110] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #26] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #10] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #108] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #24] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #8] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #106] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #22] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #6] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #104] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #20] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #4] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #102] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #18] ; NONEON-NOSVE-NEXT: ldr h2, [sp, #2] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #100] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 ; NONEON-NOSVE-NEXT: ldr h1, [sp, #16] ; NONEON-NOSVE-NEXT: ldr h2, [sp] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d1, h1 +; NONEON-NOSVE-NEXT: fcvt d2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #98] ; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 -; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 +; NONEON-NOSVE-NEXT: fcvt h0, d0 ; NONEON-NOSVE-NEXT: str h0, [sp, #96] ; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] diff --git a/llvm/test/CodeGen/ARM/fp16-promote.ll b/llvm/test/CodeGen/ARM/fp16-promote.ll index 8230e47259dd8..27a0bf2eb9037 100644 --- a/llvm/test/CodeGen/ARM/fp16-promote.ll +++ b/llvm/test/CodeGen/ARM/fp16-promote.ll @@ -1508,61 +1508,81 @@ define void @test_fma(ptr %p, ptr %q, ptr %r) #0 { ; CHECK-FP16-NEXT: push {r4, lr} ; CHECK-FP16-NEXT: mov r4, r0 ; CHECK-FP16-NEXT: ldrh r0, [r1] -; CHECK-FP16-NEXT: ldrh r1, [r4] -; CHECK-FP16-NEXT: ldrh r2, [r2] -; CHECK-FP16-NEXT: vmov s2, r0 +; CHECK-FP16-NEXT: ldrh r1, [r2] +; CHECK-FP16-NEXT: vmov s0, r0 +; CHECK-FP16-NEXT: ldrh r0, [r4] +; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-FP16-NEXT: vcvt.f64.f32 d16, s0 +; CHECK-FP16-NEXT: vmov s0, r0 +; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-FP16-NEXT: vcvt.f64.f32 d17, s0 ; CHECK-FP16-NEXT: vmov s0, r1 -; CHECK-FP16-NEXT: vcvtb.f32.f16 s1, s2 -; CHECK-FP16-NEXT: vmov s2, r2 ; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-FP16-NEXT: vcvtb.f32.f16 s2, s2 -; CHECK-FP16-NEXT: bl fmaf -; CHECK-FP16-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-FP16-NEXT: vmov r0, s0 +; CHECK-FP16-NEXT: vcvt.f64.f32 d18, s0 +; CHECK-FP16-NEXT: vmla.f64 d18, d17, d16 +; CHECK-FP16-NEXT: vmov r0, r1, d18 +; CHECK-FP16-NEXT: bl __aeabi_d2h ; CHECK-FP16-NEXT: strh r0, [r4] ; CHECK-FP16-NEXT: pop {r4, pc} ; ; CHECK-LIBCALL-VFP-LABEL: test_fma: ; CHECK-LIBCALL-VFP: .save {r4, r5, r6, lr} ; CHECK-LIBCALL-VFP-NEXT: push {r4, r5, r6, lr} +; CHECK-LIBCALL-VFP-NEXT: .vsave {d8, d9} +; CHECK-LIBCALL-VFP-NEXT: vpush {d8, d9} ; CHECK-LIBCALL-VFP-NEXT: mov r4, r0 -; CHECK-LIBCALL-VFP-NEXT: ldrh r0, [r2] -; CHECK-LIBCALL-VFP-NEXT: mov r5, r1 +; CHECK-LIBCALL-VFP-NEXT: ldrh r0, [r0] +; CHECK-LIBCALL-VFP-NEXT: mov r5, r2 +; CHECK-LIBCALL-VFP-NEXT: mov r6, r1 ; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_h2f -; CHECK-LIBCALL-VFP-NEXT: mov r6, r0 -; CHECK-LIBCALL-VFP-NEXT: ldrh r0, [r5] +; CHECK-LIBCALL-VFP-NEXT: ldrh r1, [r6] +; CHECK-LIBCALL-VFP-NEXT: vmov s16, r0 +; CHECK-LIBCALL-VFP-NEXT: ldrh r5, [r5] +; CHECK-LIBCALL-VFP-NEXT: mov r0, r1 ; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_h2f -; CHECK-LIBCALL-VFP-NEXT: mov r5, r0 -; CHECK-LIBCALL-VFP-NEXT: ldrh r0, [r4] +; CHECK-LIBCALL-VFP-NEXT: vmov s18, r0 +; CHECK-LIBCALL-VFP-NEXT: mov r0, r5 ; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_h2f ; CHECK-LIBCALL-VFP-NEXT: vmov s0, r0 -; CHECK-LIBCALL-VFP-NEXT: vmov s1, r5 -; CHECK-LIBCALL-VFP-NEXT: vmov s2, r6 -; CHECK-LIBCALL-VFP-NEXT: bl fmaf -; CHECK-LIBCALL-VFP-NEXT: vmov r0, s0 -; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_f2h +; CHECK-LIBCALL-VFP-NEXT: vcvt.f64.f32 d16, s18 +; CHECK-LIBCALL-VFP-NEXT: vcvt.f64.f32 d17, s16 +; CHECK-LIBCALL-VFP-NEXT: vcvt.f64.f32 d18, s0 +; CHECK-LIBCALL-VFP-NEXT: vmla.f64 d18, d17, d16 +; CHECK-LIBCALL-VFP-NEXT: vmov r0, r1, d18 +; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_d2h ; CHECK-LIBCALL-VFP-NEXT: strh r0, [r4] +; CHECK-LIBCALL-VFP-NEXT: vpop {d8, d9} ; CHECK-LIBCALL-VFP-NEXT: pop {r4, r5, r6, pc} ; ; CHECK-NOVFP-LABEL: test_fma: -; CHECK-NOVFP: .save {r4, r5, r6, lr} -; CHECK-NOVFP-NEXT: push {r4, r5, r6, lr} +; CHECK-NOVFP: .save {r4, r5, r6, r7, r11, lr} +; CHECK-NOVFP-NEXT: push {r4, r5, r6, r7, r11, lr} ; CHECK-NOVFP-NEXT: mov r4, r0 ; CHECK-NOVFP-NEXT: ldrh r0, [r1] ; CHECK-NOVFP-NEXT: mov r5, r2 ; CHECK-NOVFP-NEXT: bl __aeabi_h2f +; CHECK-NOVFP-NEXT: bl __aeabi_f2d ; CHECK-NOVFP-NEXT: mov r6, r0 -; CHECK-NOVFP-NEXT: ldrh r0, [r5] -; CHECK-NOVFP-NEXT: bl __aeabi_h2f -; CHECK-NOVFP-NEXT: mov r5, r0 ; CHECK-NOVFP-NEXT: ldrh r0, [r4] +; CHECK-NOVFP-NEXT: mov r7, r1 ; CHECK-NOVFP-NEXT: bl __aeabi_h2f -; CHECK-NOVFP-NEXT: mov r1, r6 -; CHECK-NOVFP-NEXT: mov r2, r5 -; CHECK-NOVFP-NEXT: bl fmaf -; CHECK-NOVFP-NEXT: bl __aeabi_f2h +; CHECK-NOVFP-NEXT: bl __aeabi_f2d +; CHECK-NOVFP-NEXT: mov r2, r6 +; CHECK-NOVFP-NEXT: mov r3, r7 +; CHECK-NOVFP-NEXT: bl __aeabi_dmul +; CHECK-NOVFP-NEXT: mov r6, r0 +; CHECK-NOVFP-NEXT: ldrh r0, [r5] +; CHECK-NOVFP-NEXT: mov r7, r1 +; CHECK-NOVFP-NEXT: bl __aeabi_h2f +; CHECK-NOVFP-NEXT: bl __aeabi_f2d +; CHECK-NOVFP-NEXT: mov r2, r0 +; CHECK-NOVFP-NEXT: mov r3, r1 +; CHECK-NOVFP-NEXT: mov r0, r6 +; CHECK-NOVFP-NEXT: mov r1, r7 +; CHECK-NOVFP-NEXT: bl __aeabi_dadd +; CHECK-NOVFP-NEXT: bl __aeabi_d2h ; CHECK-NOVFP-NEXT: strh r0, [r4] -; CHECK-NOVFP-NEXT: pop {r4, r5, r6, pc} +; CHECK-NOVFP-NEXT: pop {r4, r5, r6, r7, r11, pc} %a = load half, ptr %p, align 2 %b = load half, ptr %q, align 2 %c = load half, ptr %r, align 2 diff --git a/llvm/test/CodeGen/Generic/half-op.ll b/llvm/test/CodeGen/Generic/half-op.ll index 1037d8e20cc10..f8ad39f9456aa 100644 --- a/llvm/test/CodeGen/Generic/half-op.ll +++ b/llvm/test/CodeGen/Generic/half-op.ll @@ -8,39 +8,39 @@ ; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=arm64ec-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} ; RUN: %if amdgpu-registered-target %{ llc %s -o - -mtriple=amdgcn-amd-amdhsa | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} ; RUN: %if arc-registered-target %{ llc %s -o - -mtriple=arc-elf | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} -; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=arm-unknown-linux-gnueabi | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} -; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=thumbv7em-none-eabi | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} -; RUN: %if avr-registered-target %{ llc %s -o - -mtriple=avr-none | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=arm-unknown-linux-gnueabi | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN %} +; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=thumbv7em-none-eabi | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if avr-registered-target %{ llc %s -o - -mtriple=avr-none | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} ; FIXME: BPF has a compiler error ; RUN: %if csky-registered-target %{ llc %s -o - -mtriple=csky-unknown-linux-gnuabiv2 | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} ; RUN: %if csky-registered-target %{ llc %s -o - -mtriple=csky-unknown-linux-gnuabiv2 -mcpu=ck860fv -mattr=+hard-float | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} ; FIXME: directx has a compiler error -; RUN: %if hexagon-registered-target %{ llc %s -o - -mtriple=hexagon-unknown-linux-musl | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if hexagon-registered-target %{ llc %s -o - -mtriple=hexagon-unknown-linux-musl | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} ; RUN: %if lanai-registered-target %{ llc %s -o - -mtriple=lanai-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} -; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} -; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} -; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu -mattr=+f | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu -mattr=+f | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} ; RUN: %if m68k-registered-target %{ llc %s -o - -mtriple=m68k-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} -; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} -; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips64-unknown-linux-gnuabi64 | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} -; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips64el-unknown-linux-gnuabi64 | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} -; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mipsel-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips64-unknown-linux-gnuabi64 | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips64el-unknown-linux-gnuabi64 | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mipsel-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} ; RUN: %if msp430-registered-target %{ llc %s -o - -mtriple=msp430-none-elf | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} ; RUN: %if nvptx-registered-target %{ llc %s -o - -mtriple=nvptx64-nvidia-cuda | FileCheck %s --check-prefixes=NOCRASH %} ; RUN: %if powerpc-registered-target %{ llc %s -o - -mtriple=powerpc-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} ; RUN: %if powerpc-registered-target %{ llc %s -o - -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} ; RUN: %if powerpc-registered-target %{ llc %s -o - -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} -; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} -; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} -; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} -; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} ; RUN: %if spirv-registered-target %{ llc %s -o - -mtriple=spirv-unknown-unknown | FileCheck %s --check-prefixes=NOCRASH %} -; RUN: %if systemz-registered-target %{ llc %s -o - -mtriple=s390x-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if systemz-registered-target %{ llc %s -o - -mtriple=s390x-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} ; RUN: %if ve-registered-target %{ llc %s -o - -mtriple=ve-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} ; RUN: %if webassembly-registered-target %{ llc %s -o - -mtriple=wasm32-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} -; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} -; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} -; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} ; RUN: %if xcore-registered-target %{ llc %s -o - -mtriple=xcore-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} ; RUN: %if xtensa-registered-target %{ llc %s -o - -mtriple=xtensa-none-elf | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,CHECK-FMA %} diff --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll index 4e2f7ea9e5208..53288b35d55a4 100644 --- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; ## Full FP16 support enabled by default. ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ @@ -953,11 +954,11 @@ define half @test_cos(half %a) #0 { ; CHECK-DAG: ld.param.b16 [[C:%rs[0-9]+]], [test_fma_param_2]; ; CHECK-F16-NOFTZ: fma.rn.f16 [[R:%rs[0-9]+]], [[A]], [[B]], [[C]]; ; CHECK-F16-FTZ: fma.rn.ftz.f16 [[R:%rs[0-9]+]], [[A]], [[B]], [[C]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%r[0-9]+]], [[A]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%r[0-9]+]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%r[0-9]+]], [[C]] -; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%r[0-9]+]], [[A32]], [[B32]], [[C32]]; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]] +; CHECK-NOF16-DAG: cvt.f64.f16 [[A64:%rd[0-9]+]], [[A]] +; CHECK-NOF16-DAG: cvt.f64.f16 [[B64:%rd[0-9]+]], [[B]] +; CHECK-NOF16-DAG: cvt.f64.f16 [[C64:%rd[0-9]+]], [[C]] +; CHECK-NOF16-NEXT: fma.rn.f64 [[R64:%rd[0-9]+]], [[A64]], [[B64]], [[C64]]; +; CHECK-NOF16-NEXT: cvt.rn.f16.f64 [[R:%rs[0-9]+]], [[R64]] ; CHECK: st.param.b16 [func_retval0], [[R]]; ; CHECK: ret define half @test_fma(half %a, half %b, half %c) #0 { @@ -1151,11 +1152,11 @@ define half @test_round(half %a) #0 { ; CHECK-DAG: ld.param.b16 [[C:%rs[0-9]+]], [test_fmuladd_param_2]; ; CHECK-F16-NOFTZ: fma.rn.f16 [[R:%rs[0-9]+]], [[A]], [[B]], [[C]]; ; CHECK-F16-FTZ: fma.rn.ftz.f16 [[R:%rs[0-9]+]], [[A]], [[B]], [[C]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%r[0-9]+]], [[A]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%r[0-9]+]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%r[0-9]+]], [[C]] -; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%r[0-9]+]], [[A32]], [[B32]], [[C32]]; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]] +; CHECK-NOF16-DAG: cvt.f64.f16 [[A64:%rd[0-9]+]], [[A]] +; CHECK-NOF16-DAG: cvt.f64.f16 [[B64:%rd[0-9]+]], [[B]] +; CHECK-NOF16-DAG: cvt.f64.f16 [[C64:%rd[0-9]+]], [[C]] +; CHECK-NOF16-NEXT: fma.rn.f64 [[R64:%rd[0-9]+]], [[A64]], [[B64]], [[C64]]; +; CHECK-NOF16-NEXT: cvt.rn.f16.f64 [[R:%rs[0-9]+]], [[R64]] ; CHECK: st.param.b16 [func_retval0], [[R]]; ; CHECK: ret; define half @test_fmuladd(half %a, half %b, half %c) #0 { @@ -1183,3 +1184,9 @@ define <2 x half> @test_neg_f16x2(<2 x half> noundef %arg) #0 { } attributes #0 = { nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} +; CHECK-F16-FTZ: {{.*}} +; CHECK-F16-NOFTZ: {{.*}} +; CHECK-NOF16: {{.*}} +; CHECK-NOFTZ: {{.*}} diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll index e9143d540b047..3ebaf68d4a15f 100644 --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -1766,27 +1766,28 @@ define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { ; CHECK-NOF16-LABEL: test_fma( ; CHECK-NOF16: { ; CHECK-NOF16-NEXT: .reg .b16 %rs<9>; -; CHECK-NOF16-NEXT: .reg .b32 %r<13>; +; CHECK-NOF16-NEXT: .reg .b32 %r<5>; +; CHECK-NOF16-NEXT: .reg .b64 %rd<9>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_fma_param_2]; ; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fma_param_1]; ; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fma_param_0]; ; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: cvt.f64.f16 %rd1, %rs2; ; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs4; +; CHECK-NOF16-NEXT: cvt.f64.f16 %rd2, %rs4; ; CHECK-NOF16-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs6; -; CHECK-NOF16-NEXT: fma.rn.f32 %r7, %r6, %r5, %r4; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs7, %r7; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs5; -; CHECK-NOF16-NEXT: fma.rn.f32 %r11, %r10, %r9, %r8; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs8, %r11; -; CHECK-NOF16-NEXT: mov.b32 %r12, {%rs8, %rs7}; -; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r12; +; CHECK-NOF16-NEXT: cvt.f64.f16 %rd3, %rs6; +; CHECK-NOF16-NEXT: fma.rn.f64 %rd4, %rd3, %rd2, %rd1; +; CHECK-NOF16-NEXT: cvt.rn.f16.f64 %rs7, %rd4; +; CHECK-NOF16-NEXT: cvt.f64.f16 %rd5, %rs1; +; CHECK-NOF16-NEXT: cvt.f64.f16 %rd6, %rs3; +; CHECK-NOF16-NEXT: cvt.f64.f16 %rd7, %rs5; +; CHECK-NOF16-NEXT: fma.rn.f64 %rd8, %rd7, %rd6, %rd5; +; CHECK-NOF16-NEXT: cvt.rn.f16.f64 %rs8, %rd8; +; CHECK-NOF16-NEXT: mov.b32 %r4, {%rs8, %rs7}; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NOF16-NEXT: ret; %r = call <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) ret <2 x half> %r @@ -2203,27 +2204,28 @@ define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 ; CHECK-NOF16-LABEL: test_fmuladd( ; CHECK-NOF16: { ; CHECK-NOF16-NEXT: .reg .b16 %rs<9>; -; CHECK-NOF16-NEXT: .reg .b32 %r<13>; +; CHECK-NOF16-NEXT: .reg .b32 %r<5>; +; CHECK-NOF16-NEXT: .reg .b64 %rd<9>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_fmuladd_param_2]; ; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fmuladd_param_1]; ; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fmuladd_param_0]; ; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: cvt.f64.f16 %rd1, %rs2; ; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs4; +; CHECK-NOF16-NEXT: cvt.f64.f16 %rd2, %rs4; ; CHECK-NOF16-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs6; -; CHECK-NOF16-NEXT: fma.rn.f32 %r7, %r6, %r5, %r4; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs7, %r7; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs5; -; CHECK-NOF16-NEXT: fma.rn.f32 %r11, %r10, %r9, %r8; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs8, %r11; -; CHECK-NOF16-NEXT: mov.b32 %r12, {%rs8, %rs7}; -; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r12; +; CHECK-NOF16-NEXT: cvt.f64.f16 %rd3, %rs6; +; CHECK-NOF16-NEXT: fma.rn.f64 %rd4, %rd3, %rd2, %rd1; +; CHECK-NOF16-NEXT: cvt.rn.f16.f64 %rs7, %rd4; +; CHECK-NOF16-NEXT: cvt.f64.f16 %rd5, %rs1; +; CHECK-NOF16-NEXT: cvt.f64.f16 %rd6, %rs3; +; CHECK-NOF16-NEXT: cvt.f64.f16 %rd7, %rs5; +; CHECK-NOF16-NEXT: fma.rn.f64 %rd8, %rd7, %rd6, %rd5; +; CHECK-NOF16-NEXT: cvt.rn.f16.f64 %rs8, %rd8; +; CHECK-NOF16-NEXT: mov.b32 %r4, {%rs8, %rs7}; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NOF16-NEXT: ret; %r = call <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) ret <2 x half> %r diff --git a/llvm/test/CodeGen/RISCV/half-arith.ll b/llvm/test/CodeGen/RISCV/half-arith.ll index e1eb860d26591..311905be2ce25 100644 --- a/llvm/test/CodeGen/RISCV/half-arith.ll +++ b/llvm/test/CodeGen/RISCV/half-arith.ll @@ -1093,28 +1093,41 @@ define half @fmadd_h(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 ; RV32I-NEXT: mv s1, a1 ; RV32I-NEXT: lui a1, 16 -; RV32I-NEXT: addi s3, a1, -1 -; RV32I-NEXT: and a0, a0, s3 +; RV32I-NEXT: addi s4, a1, -1 +; RV32I-NEXT: and a0, a0, s4 ; RV32I-NEXT: call __extendhfsf2 +; RV32I-NEXT: call __extendsfdf2 ; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: and a0, s1, s3 +; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: and a0, s1, s4 ; RV32I-NEXT: call __extendhfsf2 +; RV32I-NEXT: call __extendsfdf2 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: call __muldf3 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: and a0, s0, s3 +; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: and a0, s0, s4 ; RV32I-NEXT: call __extendhfsf2 +; RV32I-NEXT: call __extendsfdf2 ; RV32I-NEXT: mv a2, a0 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s1 -; RV32I-NEXT: call fmaf -; RV32I-NEXT: call __truncsfhf2 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s2 +; RV32I-NEXT: call __adddf3 +; RV32I-NEXT: call __truncdfhf2 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret ; @@ -1132,17 +1145,22 @@ define half @fmadd_h(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi s3, a1, -1 ; RV64I-NEXT: and a0, a0, s3 ; RV64I-NEXT: call __extendhfsf2 +; RV64I-NEXT: call __extendsfdf2 ; RV64I-NEXT: mv s2, a0 ; RV64I-NEXT: and a0, s1, s3 ; RV64I-NEXT: call __extendhfsf2 +; RV64I-NEXT: call __extendsfdf2 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __muldf3 ; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: and a0, s0, s3 ; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: mv a2, a0 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: mv a1, s1 -; RV64I-NEXT: call fmaf -; RV64I-NEXT: call __truncsfhf2 +; RV64I-NEXT: call __extendsfdf2 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __adddf3 +; RV64I-NEXT: call __truncdfhf2 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -1194,35 +1212,48 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a1 ; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s2, a0, -1 -; RV32I-NEXT: and a0, a2, s2 +; RV32I-NEXT: addi s3, a0, -1 +; RV32I-NEXT: and a0, a2, s3 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 ; RV32I-NEXT: lui a1, 8 -; RV32I-NEXT: xor s3, a0, a1 -; RV32I-NEXT: and a0, s1, s2 +; RV32I-NEXT: xor s4, a0, a1 +; RV32I-NEXT: and a0, s1, s3 ; RV32I-NEXT: call __extendhfsf2 +; RV32I-NEXT: call __extendsfdf2 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: and a0, s0, s2 +; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: and a0, s0, s3 ; RV32I-NEXT: call __extendhfsf2 +; RV32I-NEXT: call __extendsfdf2 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s2 +; RV32I-NEXT: call __muldf3 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: and a0, s3, s2 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: and a0, s4, s3 ; RV32I-NEXT: call __extendhfsf2 +; RV32I-NEXT: call __extendsfdf2 ; RV32I-NEXT: mv a2, a0 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s0 -; RV32I-NEXT: call fmaf -; RV32I-NEXT: call __truncsfhf2 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: call __adddf3 +; RV32I-NEXT: call __truncdfhf2 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret ; @@ -1247,17 +1278,22 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: xor s3, a0, a1 ; RV64I-NEXT: and a0, s1, s2 ; RV64I-NEXT: call __extendhfsf2 +; RV64I-NEXT: call __extendsfdf2 ; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: and a0, s0, s2 ; RV64I-NEXT: call __extendhfsf2 +; RV64I-NEXT: call __extendsfdf2 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __muldf3 ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: and a0, s3, s2 ; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: mv a2, a0 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a1, s0 -; RV64I-NEXT: call fmaf -; RV64I-NEXT: call __truncsfhf2 +; RV64I-NEXT: call __extendsfdf2 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __adddf3 +; RV64I-NEXT: call __truncdfhf2 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -1329,8 +1365,8 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: lui a1, 16 -; RV32I-NEXT: addi s3, a1, -1 +; RV32I-NEXT: lui s3, 16 +; RV32I-NEXT: addi s3, s3, -1 ; RV32I-NEXT: and a0, a0, s3 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: li a1, 0 @@ -1347,17 +1383,26 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: xor s4, a0, a1 ; RV32I-NEXT: and a0, s1, s3 ; RV32I-NEXT: call __extendhfsf2 +; RV32I-NEXT: call __extendsfdf2 ; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: mv s1, a1 ; RV32I-NEXT: and a0, s2, s3 ; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: call __extendsfdf2 +; RV32I-NEXT: mv a2, s0 +; RV32I-NEXT: mv a3, s1 +; RV32I-NEXT: call __muldf3 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: mv s1, a1 ; RV32I-NEXT: and a0, s4, s3 ; RV32I-NEXT: call __extendhfsf2 +; RV32I-NEXT: call __extendsfdf2 ; RV32I-NEXT: mv a2, a0 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a1, s0 -; RV32I-NEXT: call fmaf -; RV32I-NEXT: call __truncsfhf2 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: call __adddf3 +; RV32I-NEXT: call __truncdfhf2 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -1378,8 +1423,8 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: sd s4, 0(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a2 ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: lui a1, 16 -; RV64I-NEXT: addi s3, a1, -1 +; RV64I-NEXT: lui s3, 16 +; RV64I-NEXT: addi s3, s3, -1 ; RV64I-NEXT: and a0, a0, s3 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: li a1, 0 @@ -1396,17 +1441,21 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: xor s4, a0, a1 ; RV64I-NEXT: and a0, s1, s3 ; RV64I-NEXT: call __extendhfsf2 +; RV64I-NEXT: call __extendsfdf2 ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: and a0, s2, s3 ; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: call __extendsfdf2 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldf3 +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: and a0, s4, s3 ; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: mv a2, a0 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a1, s0 -; RV64I-NEXT: call fmaf -; RV64I-NEXT: call __truncsfhf2 +; RV64I-NEXT: call __extendsfdf2 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __adddf3 +; RV64I-NEXT: call __truncdfhf2 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -1491,8 +1540,8 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s3, a0, -1 +; RV32I-NEXT: lui s3, 16 +; RV32I-NEXT: addi s3, s3, -1 ; RV32I-NEXT: and a0, a1, s3 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: li a1, 0 @@ -1509,17 +1558,28 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: xor s4, a0, a1 ; RV32I-NEXT: and a0, s1, s3 ; RV32I-NEXT: call __extendhfsf2 +; RV32I-NEXT: call __extendsfdf2 ; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: mv s1, a1 ; RV32I-NEXT: and a0, s2, s3 ; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: call __extendsfdf2 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: call __muldf3 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: mv s1, a1 ; RV32I-NEXT: and a0, s4, s3 ; RV32I-NEXT: call __extendhfsf2 +; RV32I-NEXT: call __extendsfdf2 ; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: mv a1, s1 -; RV32I-NEXT: call fmaf -; RV32I-NEXT: call __truncsfhf2 +; RV32I-NEXT: call __adddf3 +; RV32I-NEXT: call __truncdfhf2 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -1540,8 +1600,8 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: sd s4, 0(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a2 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lui a0, 16 -; RV64I-NEXT: addi s3, a0, -1 +; RV64I-NEXT: lui s3, 16 +; RV64I-NEXT: addi s3, s3, -1 ; RV64I-NEXT: and a0, a1, s3 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: li a1, 0 @@ -1558,17 +1618,22 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: xor s4, a0, a1 ; RV64I-NEXT: and a0, s1, s3 ; RV64I-NEXT: call __extendhfsf2 +; RV64I-NEXT: call __extendsfdf2 ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: and a0, s2, s3 ; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: call __extendsfdf2 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __muldf3 +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: and a0, s4, s3 ; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: call __extendsfdf2 +; RV64I-NEXT: mv a1, a0 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s1 -; RV64I-NEXT: call fmaf -; RV64I-NEXT: call __truncsfhf2 +; RV64I-NEXT: call __adddf3 +; RV64I-NEXT: call __truncdfhf2 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -1659,23 +1724,35 @@ define half @fnmadd_h_3(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 ; RV32I-NEXT: mv s1, a1 ; RV32I-NEXT: lui a1, 16 -; RV32I-NEXT: addi s3, a1, -1 -; RV32I-NEXT: and a0, a0, s3 +; RV32I-NEXT: addi s4, a1, -1 +; RV32I-NEXT: and a0, a0, s4 ; RV32I-NEXT: call __extendhfsf2 +; RV32I-NEXT: call __extendsfdf2 ; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: and a0, s1, s3 +; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: and a0, s1, s4 ; RV32I-NEXT: call __extendhfsf2 +; RV32I-NEXT: call __extendsfdf2 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: call __muldf3 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: and a0, s0, s3 +; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: and a0, s0, s4 ; RV32I-NEXT: call __extendhfsf2 +; RV32I-NEXT: call __extendsfdf2 ; RV32I-NEXT: mv a2, a0 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s1 -; RV32I-NEXT: call fmaf -; RV32I-NEXT: call __truncsfhf2 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s2 +; RV32I-NEXT: call __adddf3 +; RV32I-NEXT: call __truncdfhf2 ; RV32I-NEXT: lui a1, 1048568 ; RV32I-NEXT: xor a0, a0, a1 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload @@ -1683,6 +1760,7 @@ define half @fnmadd_h_3(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret ; @@ -1700,17 +1778,22 @@ define half @fnmadd_h_3(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi s3, a1, -1 ; RV64I-NEXT: and a0, a0, s3 ; RV64I-NEXT: call __extendhfsf2 +; RV64I-NEXT: call __extendsfdf2 ; RV64I-NEXT: mv s2, a0 ; RV64I-NEXT: and a0, s1, s3 ; RV64I-NEXT: call __extendhfsf2 +; RV64I-NEXT: call __extendsfdf2 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __muldf3 ; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: and a0, s0, s3 ; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: mv a2, a0 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: mv a1, s1 -; RV64I-NEXT: call fmaf -; RV64I-NEXT: call __truncsfhf2 +; RV64I-NEXT: call __extendsfdf2 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __adddf3 +; RV64I-NEXT: call __truncdfhf2 ; RV64I-NEXT: lui a1, 1048568 ; RV64I-NEXT: xor a0, a0, a1 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload @@ -1779,23 +1862,35 @@ define half @fnmadd_nsz(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 ; RV32I-NEXT: mv s1, a1 ; RV32I-NEXT: lui a1, 16 -; RV32I-NEXT: addi s3, a1, -1 -; RV32I-NEXT: and a0, a0, s3 +; RV32I-NEXT: addi s4, a1, -1 +; RV32I-NEXT: and a0, a0, s4 ; RV32I-NEXT: call __extendhfsf2 +; RV32I-NEXT: call __extendsfdf2 ; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: and a0, s1, s3 +; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: and a0, s1, s4 ; RV32I-NEXT: call __extendhfsf2 +; RV32I-NEXT: call __extendsfdf2 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: call __muldf3 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: and a0, s0, s3 +; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: and a0, s0, s4 ; RV32I-NEXT: call __extendhfsf2 +; RV32I-NEXT: call __extendsfdf2 ; RV32I-NEXT: mv a2, a0 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s1 -; RV32I-NEXT: call fmaf -; RV32I-NEXT: call __truncsfhf2 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s2 +; RV32I-NEXT: call __adddf3 +; RV32I-NEXT: call __truncdfhf2 ; RV32I-NEXT: lui a1, 1048568 ; RV32I-NEXT: xor a0, a0, a1 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload @@ -1803,6 +1898,7 @@ define half @fnmadd_nsz(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret ; @@ -1820,17 +1916,22 @@ define half @fnmadd_nsz(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi s3, a1, -1 ; RV64I-NEXT: and a0, a0, s3 ; RV64I-NEXT: call __extendhfsf2 +; RV64I-NEXT: call __extendsfdf2 ; RV64I-NEXT: mv s2, a0 ; RV64I-NEXT: and a0, s1, s3 ; RV64I-NEXT: call __extendhfsf2 +; RV64I-NEXT: call __extendsfdf2 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __muldf3 ; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: and a0, s0, s3 ; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: mv a2, a0 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: mv a1, s1 -; RV64I-NEXT: call fmaf -; RV64I-NEXT: call __truncsfhf2 +; RV64I-NEXT: call __extendsfdf2 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __adddf3 +; RV64I-NEXT: call __truncdfhf2 ; RV64I-NEXT: lui a1, 1048568 ; RV64I-NEXT: xor a0, a0, a1 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload @@ -1892,34 +1993,46 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 ; RV32I-NEXT: mv s1, a1 ; RV32I-NEXT: lui a1, 16 -; RV32I-NEXT: addi s2, a1, -1 -; RV32I-NEXT: and a0, a0, s2 +; RV32I-NEXT: addi s3, a1, -1 +; RV32I-NEXT: and a0, a0, s3 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 ; RV32I-NEXT: lui a1, 8 -; RV32I-NEXT: xor s3, a0, a1 -; RV32I-NEXT: and a0, s1, s2 +; RV32I-NEXT: xor s4, a0, a1 +; RV32I-NEXT: and a0, s1, s3 ; RV32I-NEXT: call __extendhfsf2 +; RV32I-NEXT: call __extendsfdf2 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: and a0, s0, s2 +; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: and a0, s4, s3 ; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: and a0, s3, s2 +; RV32I-NEXT: call __extendsfdf2 +; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a3, s2 +; RV32I-NEXT: call __muldf3 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: and a0, s0, s3 ; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: mv a1, s1 -; RV32I-NEXT: mv a2, s0 -; RV32I-NEXT: call fmaf -; RV32I-NEXT: call __truncsfhf2 +; RV32I-NEXT: call __extendsfdf2 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s2 +; RV32I-NEXT: call __adddf3 +; RV32I-NEXT: call __truncdfhf2 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret ; @@ -1944,16 +2057,21 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: xor s3, a0, a1 ; RV64I-NEXT: and a0, s1, s2 ; RV64I-NEXT: call __extendhfsf2 +; RV64I-NEXT: call __extendsfdf2 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: and a0, s0, s2 -; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: and a0, s3, s2 ; RV64I-NEXT: call __extendhfsf2 +; RV64I-NEXT: call __extendsfdf2 ; RV64I-NEXT: mv a1, s1 -; RV64I-NEXT: mv a2, s0 -; RV64I-NEXT: call fmaf -; RV64I-NEXT: call __truncsfhf2 +; RV64I-NEXT: call __muldf3 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: and a0, s0, s2 +; RV64I-NEXT: call __extendhfsf2 +; RV64I-NEXT: call __extendsfdf2 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __adddf3 +; RV64I-NEXT: call __truncdfhf2 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -2020,35 +2138,48 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 ; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s2, a0, -1 -; RV32I-NEXT: and a0, a1, s2 +; RV32I-NEXT: addi s3, a0, -1 +; RV32I-NEXT: and a0, a1, s3 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 ; RV32I-NEXT: lui a1, 8 -; RV32I-NEXT: xor s3, a0, a1 -; RV32I-NEXT: and a0, s1, s2 +; RV32I-NEXT: xor s4, a0, a1 +; RV32I-NEXT: and a0, s1, s3 ; RV32I-NEXT: call __extendhfsf2 +; RV32I-NEXT: call __extendsfdf2 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: and a0, s0, s2 +; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: and a0, s4, s3 ; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: and a0, s3, s2 +; RV32I-NEXT: call __extendsfdf2 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s2 +; RV32I-NEXT: call __muldf3 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: and a0, s0, s3 ; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: call __extendsfdf2 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 ; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: mv a2, s0 -; RV32I-NEXT: call fmaf -; RV32I-NEXT: call __truncsfhf2 +; RV32I-NEXT: mv a1, s2 +; RV32I-NEXT: call __adddf3 +; RV32I-NEXT: call __truncdfhf2 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret ; @@ -2073,17 +2204,22 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: xor s3, a0, a1 ; RV64I-NEXT: and a0, s1, s2 ; RV64I-NEXT: call __extendhfsf2 +; RV64I-NEXT: call __extendsfdf2 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: and a0, s0, s2 -; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: and a0, s3, s2 ; RV64I-NEXT: call __extendhfsf2 +; RV64I-NEXT: call __extendsfdf2 ; RV64I-NEXT: mv a1, a0 ; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: mv a2, s0 -; RV64I-NEXT: call fmaf -; RV64I-NEXT: call __truncsfhf2 +; RV64I-NEXT: call __muldf3 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: and a0, s0, s2 +; RV64I-NEXT: call __extendhfsf2 +; RV64I-NEXT: call __extendsfdf2 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __adddf3 +; RV64I-NEXT: call __truncdfhf2 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/half-intrinsics.ll b/llvm/test/CodeGen/RISCV/half-intrinsics.ll index e712bd919b0b1..e98dfbc22b532 100644 --- a/llvm/test/CodeGen/RISCV/half-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/half-intrinsics.ll @@ -1690,28 +1690,41 @@ define half @fma_f16(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 ; RV32I-NEXT: mv s1, a1 ; RV32I-NEXT: lui a1, 16 -; RV32I-NEXT: addi s3, a1, -1 -; RV32I-NEXT: and a0, a0, s3 +; RV32I-NEXT: addi s4, a1, -1 +; RV32I-NEXT: and a0, a0, s4 ; RV32I-NEXT: call __extendhfsf2 +; RV32I-NEXT: call __extendsfdf2 ; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: and a0, s1, s3 +; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: and a0, s1, s4 ; RV32I-NEXT: call __extendhfsf2 +; RV32I-NEXT: call __extendsfdf2 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: call __muldf3 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: and a0, s0, s3 +; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: and a0, s0, s4 ; RV32I-NEXT: call __extendhfsf2 +; RV32I-NEXT: call __extendsfdf2 ; RV32I-NEXT: mv a2, a0 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s1 -; RV32I-NEXT: call fmaf -; RV32I-NEXT: call __truncsfhf2 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s2 +; RV32I-NEXT: call __adddf3 +; RV32I-NEXT: call __truncdfhf2 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret ; @@ -1729,17 +1742,22 @@ define half @fma_f16(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi s3, a1, -1 ; RV64I-NEXT: and a0, a0, s3 ; RV64I-NEXT: call __extendhfsf2 +; RV64I-NEXT: call __extendsfdf2 ; RV64I-NEXT: mv s2, a0 ; RV64I-NEXT: and a0, s1, s3 ; RV64I-NEXT: call __extendhfsf2 +; RV64I-NEXT: call __extendsfdf2 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __muldf3 ; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: and a0, s0, s3 ; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: mv a2, a0 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: mv a1, s1 -; RV64I-NEXT: call fmaf -; RV64I-NEXT: call __truncsfhf2 +; RV64I-NEXT: call __extendsfdf2 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __adddf3 +; RV64I-NEXT: call __truncdfhf2 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -1748,14 +1766,41 @@ define half @fma_f16(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: addi sp, sp, 48 ; RV64I-NEXT: ret ; -; CHECKIZFHMIN-LABEL: fma_f16: -; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa2 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa1 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa0 -; CHECKIZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 -; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; CHECKIZFHMIN-NEXT: ret +; RV32IFZFHMIN-LABEL: fma_f16: +; RV32IFZFHMIN: # %bb.0: +; RV32IFZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV32IFZFHMIN-NEXT: fcvt.s.h fa4, fa1 +; RV32IFZFHMIN-NEXT: fcvt.s.h fa3, fa0 +; RV32IFZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV32IFZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV32IFZFHMIN-NEXT: ret +; +; RV64IFZFHMIN-LABEL: fma_f16: +; RV64IFZFHMIN: # %bb.0: +; RV64IFZFHMIN-NEXT: fcvt.s.h fa5, fa2 +; RV64IFZFHMIN-NEXT: fcvt.s.h fa4, fa1 +; RV64IFZFHMIN-NEXT: fcvt.s.h fa3, fa0 +; RV64IFZFHMIN-NEXT: fmadd.s fa5, fa3, fa4, fa5 +; RV64IFZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; RV64IFZFHMIN-NEXT: ret +; +; RV32IDZFHMIN-LABEL: fma_f16: +; RV32IDZFHMIN: # %bb.0: +; RV32IDZFHMIN-NEXT: fcvt.d.h fa5, fa2 +; RV32IDZFHMIN-NEXT: fcvt.d.h fa4, fa1 +; RV32IDZFHMIN-NEXT: fcvt.d.h fa3, fa0 +; RV32IDZFHMIN-NEXT: fmadd.d fa5, fa3, fa4, fa5 +; RV32IDZFHMIN-NEXT: fcvt.h.d fa0, fa5 +; RV32IDZFHMIN-NEXT: ret +; +; RV64IDZFHMIN-LABEL: fma_f16: +; RV64IDZFHMIN: # %bb.0: +; RV64IDZFHMIN-NEXT: fcvt.d.h fa5, fa2 +; RV64IDZFHMIN-NEXT: fcvt.d.h fa4, fa1 +; RV64IDZFHMIN-NEXT: fcvt.d.h fa3, fa0 +; RV64IDZFHMIN-NEXT: fmadd.d fa5, fa3, fa4, fa5 +; RV64IDZFHMIN-NEXT: fcvt.h.d fa0, fa5 +; RV64IDZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: fma_f16: ; CHECKIZHINXMIN: # %bb.0: diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-06.ll b/llvm/test/CodeGen/SystemZ/fp-mul-06.ll index 6b285a49057dc..3b0c63749d378 100644 --- a/llvm/test/CodeGen/SystemZ/fp-mul-06.ll +++ b/llvm/test/CodeGen/SystemZ/fp-mul-06.ll @@ -8,12 +8,12 @@ declare float @llvm.fma.f32(float %f1, float %f2, float %f3) define half @f0(half %f1, half %f2, half %acc) { ; CHECK-LABEL: f0: -; CHECK: brasl %r14, __extendhfsf2@PLT -; CHECK: brasl %r14, __extendhfsf2@PLT -; CHECK: brasl %r14, __extendhfsf2@PLT -; CHECK-SCALAR: maebr %f0, %f9, %f10 -; CHECK-VECTOR: wfmasb %f0, %f0, %f8, %f10 -; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: brasl %r14, __extendhfdf2@PLT +; CHECK: brasl %r14, __extendhfdf2@PLT +; CHECK: brasl %r14, __extendhfdf2@PLT +; CHECK-SCALAR: madbr %f0, %f9, %f10 +; CHECK-VECTOR: wfmadb %f0, %f0, %f8, %f10 +; CHECK: brasl %r14, __truncdfhf2@PLT ; CHECK: br %r14 %res = call half @llvm.fma.f16 (half %f1, half %f2, half %acc) ret half %res diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-08.ll b/llvm/test/CodeGen/SystemZ/fp-mul-08.ll index e739bddd4f18f..542cae41d4745 100644 --- a/llvm/test/CodeGen/SystemZ/fp-mul-08.ll +++ b/llvm/test/CodeGen/SystemZ/fp-mul-08.ll @@ -10,12 +10,12 @@ define half @f0(half %f1, half %f2, half %acc) { ; CHECK-LABEL: f0: ; CHECK-NOT: brasl ; CHECK: lcdfr %f{{[0-9]+}}, %f4 -; CHECK: brasl %r14, __extendhfsf2@PLT -; CHECK: brasl %r14, __extendhfsf2@PLT -; CHECK: brasl %r14, __extendhfsf2@PLT -; CHECK-SCALAR: maebr %f0, %f8, %f10 -; CHECK-VECTOR: wfmasb %f0, %f0, %f8, %f10 -; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: brasl %r14, __extendhfdf2@PLT +; CHECK: brasl %r14, __extendhfdf2@PLT +; CHECK: brasl %r14, __extendhfdf2@PLT +; CHECK-SCALAR: madbr %f0, %f8, %f10 +; CHECK-VECTOR: wfmadb %f0, %f0, %f8, %f10 +; CHECK: brasl %r14, __truncdfhf2@PLT ; CHECK: br %r14 %negacc = fneg half %acc %res = call half @llvm.fma.f16 (half %f1, half %f2, half %negacc) diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-10.ll b/llvm/test/CodeGen/SystemZ/fp-mul-10.ll index 8f2cd23112cd0..0badf2993cca7 100644 --- a/llvm/test/CodeGen/SystemZ/fp-mul-10.ll +++ b/llvm/test/CodeGen/SystemZ/fp-mul-10.ll @@ -25,11 +25,11 @@ define double @f2(double %f1, double %f2, double %acc) { define half @f3_half(half %f1, half %f2, half %acc) { ; CHECK-LABEL: f3_half: -; CHECK: brasl %r14, __extendhfsf2@PLT -; CHECK: brasl %r14, __extendhfsf2@PLT -; CHECK: brasl %r14, __extendhfsf2@PLT -; CHECK: wfmasb %f0, %f0, %f8, %f10 -; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: brasl %r14, __extendhfdf2@PLT +; CHECK: brasl %r14, __extendhfdf2@PLT +; CHECK: brasl %r14, __extendhfdf2@PLT +; CHECK: wfmadb %f0, %f0, %f8, %f10 +; CHECK: brasl %r14, __truncdfhf2@PLT ; CHECK-NOT: brasl ; CHECK: lcdfr %f0, %f0 ; CHECK-NEXT: lmg @@ -52,11 +52,11 @@ define half @f4_half(half %f1, half %f2, half %acc) { ; CHECK-LABEL: f4_half: ; CHECK-NOT: brasl ; CHECK: lcdfr %f0, %f4 -; CHECK: brasl %r14, __extendhfsf2@PLT -; CHECK: brasl %r14, __extendhfsf2@PLT -; CHECK: brasl %r14, __extendhfsf2@PLT -; CHECK: wfmasb %f0, %f0, %f8, %f10 -; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: brasl %r14, __extendhfdf2@PLT +; CHECK: brasl %r14, __extendhfdf2@PLT +; CHECK: brasl %r14, __extendhfdf2@PLT +; CHECK: wfmadb %f0, %f0, %f8, %f10 +; CHECK: brasl %r14, __truncdfhf2@PLT ; CHECK-NOT: brasl ; CHECK: lcdfr %f0, %f0 ; CHECK-NEXT: lmg diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-mul-06.ll b/llvm/test/CodeGen/SystemZ/fp-strict-mul-06.ll index c951c79aeb7c6..05ce53c98db13 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-mul-06.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-mul-06.ll @@ -8,13 +8,13 @@ declare float @llvm.experimental.constrained.fma.f32(float, float, float, metada define half @f0(half %f1, half %f2, half %acc) #0 { ; CHECK-LABEL: f0: -; CHECK: brasl %r14, __extendhfsf2@PLT -; CHECK: brasl %r14, __extendhfsf2@PLT -; CHECK: brasl %r14, __extendhfsf2@PLT -; CHECK-SCALAR: maebr %f10, %f0, %f8 -; CHECK-SCALAR: ler %f0, %f10 -; CHECK-VECTOR: wfmasb %f0, %f0, %f8, %f10 -; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: brasl %r14, __extendhfdf2@PLT +; CHECK: brasl %r14, __extendhfdf2@PLT +; CHECK: brasl %r14, __extendhfdf2@PLT +; CHECK-SCALAR: madbr %f10, %f0, %f8 +; CHECK-SCALAR: ldr %f0, %f10 +; CHECK-VECTOR: wfmadb %f0, %f0, %f8, %f10 +; CHECK: brasl %r14, __truncdfhf2@PLT ; CHECK: br %r14 %res = call half @llvm.experimental.constrained.fma.f16 ( half %f1, half %f2, half %acc, diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll index b013ddad19a95..61a0c4eda8c72 100644 --- a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll @@ -432,8 +432,7 @@ define half @fma_f16(half %a, half %b, half %c) nounwind strictfp { ; SSE2: # %bb.0: ; SSE2-NEXT: subq $24, %rsp ; SSE2-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; SSE2-NEXT: callq __extendhfsf2@PLT ; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload @@ -443,12 +442,17 @@ define half @fma_f16(half %a, half %b, half %c) nounwind strictfp { ; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __extendhfsf2@PLT -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload -; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: callq fmaf@PLT -; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtss2sd %xmm0, %xmm2 +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtss2sd %xmm0, %xmm1 +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: cvtss2sd %xmm0, %xmm0 +; SSE2-NEXT: callq fma@PLT +; SSE2-NEXT: callq __truncdfhf2@PLT ; SSE2-NEXT: addq $24, %rsp ; SSE2-NEXT: retq ; @@ -460,38 +464,42 @@ define half @fma_f16(half %a, half %b, half %c) nounwind strictfp { ; F16C-NEXT: vpextrw $0, %xmm2, %edx ; F16C-NEXT: movzwl %dx, %edx ; F16C-NEXT: vmovd %edx, %xmm0 -; F16C-NEXT: vcvtph2ps %xmm0, %xmm2 +; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm2 ; F16C-NEXT: movzwl %cx, %ecx ; F16C-NEXT: vmovd %ecx, %xmm0 -; F16C-NEXT: vcvtph2ps %xmm0, %xmm1 +; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm1 ; F16C-NEXT: movzwl %ax, %eax ; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-NEXT: callq fmaf@PLT -; F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; F16C-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; F16C-NEXT: callq fma@PLT +; F16C-NEXT: callq __truncdfhf2@PLT ; F16C-NEXT: popq %rax ; F16C-NEXT: retq ; ; AVX512-LABEL: fma_f16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpextrw $0, %xmm1, %eax -; AVX512-NEXT: vpextrw $0, %xmm0, %ecx +; AVX512-NEXT: pushq %rax +; AVX512-NEXT: vpextrw $0, %xmm0, %eax +; AVX512-NEXT: vpextrw $0, %xmm1, %ecx ; AVX512-NEXT: vpextrw $0, %xmm2, %edx ; AVX512-NEXT: movzwl %dx, %edx ; AVX512-NEXT: vmovd %edx, %xmm0 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: movzwl %cx, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vmovd %ecx, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm2 ; AVX512-NEXT: movzwl %ax, %eax -; AVX512-NEXT: vmovd %eax, %xmm2 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm0 -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: popq %rax ; AVX512-NEXT: retq ; ; X86-LABEL: fma_f16: diff --git a/llvm/test/CodeGen/X86/fp16-libcalls.ll b/llvm/test/CodeGen/X86/fp16-libcalls.ll index 9b5c45f44acd0..6abdf9a5ba652 100644 --- a/llvm/test/CodeGen/X86/fp16-libcalls.ll +++ b/llvm/test/CodeGen/X86/fp16-libcalls.ll @@ -421,10 +421,13 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind { ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rdi, %rbx ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 +; F16C-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; F16C-NEXT: vcvtph2ps %xmm2, %xmm2 -; F16C-NEXT: callq fmaf@PLT -; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; F16C-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 +; F16C-NEXT: callq fma@PLT +; F16C-NEXT: callq __truncdfhf2@PLT ; F16C-NEXT: vpextrw $0, %xmm0, (%rbx) ; F16C-NEXT: popq %rbx ; F16C-NEXT: retq @@ -440,24 +443,27 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind { ; X64-NEXT: pushq %rbx ; X64-NEXT: subq $16, %rsp ; X64-NEXT: movq %rdi, %rbx -; X64-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: movaps %xmm2, %xmm0 ; X64-NEXT: callq __extendhfsf2@PLT ; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; X64-NEXT: # xmm0 = mem[0],zero,zero,zero ; X64-NEXT: callq __extendhfsf2@PLT ; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; X64-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; X64-NEXT: # xmm0 = mem[0],zero,zero,zero ; X64-NEXT: callq __extendhfsf2@PLT +; X64-NEXT: cvtss2sd %xmm0, %xmm0 ; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload ; X64-NEXT: # xmm1 = mem[0],zero,zero,zero +; X64-NEXT: cvtss2sd %xmm1, %xmm1 ; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload ; X64-NEXT: # xmm2 = mem[0],zero,zero,zero -; X64-NEXT: callq fmaf@PLT -; X64-NEXT: callq __truncsfhf2@PLT +; X64-NEXT: cvtss2sd %xmm2, %xmm2 +; X64-NEXT: callq fma@PLT +; X64-NEXT: callq __truncdfhf2@PLT ; X64-NEXT: pextrw $0, %xmm0, %eax ; X64-NEXT: movw %ax, (%rbx) ; X64-NEXT: addq $16, %rsp @@ -467,7 +473,7 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind { ; X86-LABEL: test_half_fma: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: subl $72, %esp +; X86-NEXT: subl $88, %esp ; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 @@ -487,17 +493,17 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind { ; X86-NEXT: pextrw $0, %xmm0, %eax ; X86-NEXT: movw %ax, (%esp) ; X86-NEXT: calll __extendhfsf2 -; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: fstpl {{[0-9]+}}(%esp) ; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload -; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: fstpl {{[0-9]+}}(%esp) ; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload -; X86-NEXT: fstps (%esp) -; X86-NEXT: calll fmaf -; X86-NEXT: fstps (%esp) -; X86-NEXT: calll __truncsfhf2 +; X86-NEXT: fstpl (%esp) +; X86-NEXT: calll fma +; X86-NEXT: fstpl (%esp) +; X86-NEXT: calll __truncdfhf2 ; X86-NEXT: pextrw $0, %xmm0, %eax ; X86-NEXT: movw %ax, (%esi) -; X86-NEXT: addl $72, %esp +; X86-NEXT: addl $88, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl %res = call half @llvm.fma.half(half %a0, half %a1, half %a2)