diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 83f7697a4a2b4..3f2c0f0436393 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -190,6 +190,7 @@ set(GENERIC_SOURCES # We only build BF16 files when "__bf16" is available. set(BF16_SOURCES + extendbfsf2.c truncdfbf2.c truncsfbf2.c ) diff --git a/compiler-rt/lib/builtins/extendbfsf2.c b/compiler-rt/lib/builtins/extendbfsf2.c new file mode 100644 index 0000000000000..1395dd3bdb73f --- /dev/null +++ b/compiler-rt/lib/builtins/extendbfsf2.c @@ -0,0 +1,13 @@ +//===-- lib/extendbfsf2.c - bfloat -> single conversion -----------*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#define SRC_BFLOAT +#define DST_SINGLE +#include "fp_extend_impl.inc" + +COMPILER_RT_ABI float __extendbfsf2(src_t a) { return __extendXfYf2__(a); } diff --git a/compiler-rt/lib/builtins/fp_extend.h b/compiler-rt/lib/builtins/fp_extend.h index 95ea2a7ac4b2c..8f8c04c765059 100644 --- a/compiler-rt/lib/builtins/fp_extend.h +++ b/compiler-rt/lib/builtins/fp_extend.h @@ -81,6 +81,13 @@ static inline int src_rep_t_clz_impl(src_rep_t a) { #define src_rep_t_clz src_rep_t_clz_impl +#elif defined SRC_BFLOAT +typedef __bf16 src_t; +typedef uint16_t src_rep_t; +#define SRC_REP_C UINT16_C +static const int srcSigBits = 7; +#define src_rep_t_clz __builtin_clz + #else #error Source should be half, single, or double precision! 
#endif // end source precision diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index ad876c5db4509..ef0fec270a43c 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -921,6 +921,8 @@ enum NodeType { /// has native conversions. BF16_TO_FP, FP_TO_BF16, + STRICT_BF16_TO_FP, + STRICT_FP_TO_BF16, /// Perform various unary floating-point operations inspired by libm. For /// FPOWI, the result is undefined if the integer operand doesn't fit into diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 3130f6c4dce59..d1015630b05d1 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -698,6 +698,8 @@ END_TWO_BYTE_PACK() return false; case ISD::STRICT_FP16_TO_FP: case ISD::STRICT_FP_TO_FP16: + case ISD::STRICT_BF16_TO_FP: + case ISD::STRICT_FP_TO_BF16: #define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ case ISD::STRICT_##DAGN: #include "llvm/IR/ConstrainedOps.def" diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def index 19dea60bebf9b..5e082769fa974 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.def +++ b/llvm/include/llvm/IR/RuntimeLibcalls.def @@ -304,6 +304,7 @@ HANDLE_LIBCALL(FEGETMODE, "fegetmode") HANDLE_LIBCALL(FESETMODE, "fesetmode") // Conversion +HANDLE_LIBCALL(FPEXT_BF16_F32, "__extendbfsf2") HANDLE_LIBCALL(FPEXT_F32_PPCF128, "__gcc_stoq") HANDLE_LIBCALL(FPEXT_F64_PPCF128, "__gcc_dtoq") HANDLE_LIBCALL(FPEXT_F80_F128, "__extendxftf2") diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 5f8bf0d448105..d84c2d30e4472 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -541,6 +541,8 @@ def fp_to_sint_sat : SDNode<"ISD::FP_TO_SINT_SAT" , SDTFPToIntSatOp>; def fp_to_uint_sat : 
SDNode<"ISD::FP_TO_UINT_SAT" , SDTFPToIntSatOp>; def f16_to_fp : SDNode<"ISD::FP16_TO_FP" , SDTIntToFPOp>; def fp_to_f16 : SDNode<"ISD::FP_TO_FP16" , SDTFPToIntOp>; +def bf16_to_fp : SDNode<"ISD::BF16_TO_FP" , SDTIntToFPOp>; +def fp_to_bf16 : SDNode<"ISD::FP_TO_BF16" , SDTFPToIntOp>; def strict_fadd : SDNode<"ISD::STRICT_FADD", SDTFPBinOp, [SDNPHasChain, SDNPCommutative]>; @@ -620,6 +622,11 @@ def strict_f16_to_fp : SDNode<"ISD::STRICT_FP16_TO_FP", def strict_fp_to_f16 : SDNode<"ISD::STRICT_FP_TO_FP16", SDTFPToIntOp, [SDNPHasChain]>; +def strict_bf16_to_fp : SDNode<"ISD::STRICT_BF16_TO_FP", + SDTIntToFPOp, [SDNPHasChain]>; +def strict_fp_to_bf16 : SDNode<"ISD::STRICT_FP_TO_BF16", + SDTFPToIntOp, [SDNPHasChain]>; + def strict_fsetcc : SDNode<"ISD::STRICT_FSETCC", SDTSetCC, [SDNPHasChain]>; def strict_fsetccs : SDNode<"ISD::STRICT_FSETCCS", SDTSetCC, [SDNPHasChain]>; @@ -1591,6 +1598,12 @@ def any_f16_to_fp : PatFrags<(ops node:$src), def any_fp_to_f16 : PatFrags<(ops node:$src), [(fp_to_f16 node:$src), (strict_fp_to_f16 node:$src)]>; +def any_bf16_to_fp : PatFrags<(ops node:$src), + [(bf16_to_fp node:$src), + (strict_bf16_to_fp node:$src)]>; +def any_fp_to_bf16 : PatFrags<(ops node:$src), + [(fp_to_bf16 node:$src), + (strict_fp_to_bf16 node:$src)]>; multiclass binary_atomic_op_ord { def NAME#_monotonic : PatFrag<(ops node:$ptr, node:$val), diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index f5b7752f7ecc8..2dccc45c803a0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1047,6 +1047,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { Node->getOperand(0).getValueType()); break; case ISD::STRICT_FP_TO_FP16: + case ISD::STRICT_FP_TO_BF16: case ISD::STRICT_SINT_TO_FP: case ISD::STRICT_UINT_TO_FP: case ISD::STRICT_LRINT: @@ -3645,14 +3646,14 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { DAG.getNode(ISD::FP_EXTEND, dl, 
Node->getValueType(0), Res)); } break; + case ISD::STRICT_BF16_TO_FP: case ISD::STRICT_FP16_TO_FP: if (Node->getValueType(0) != MVT::f32) { // We can extend to types bigger than f32 in two steps without changing // the result. Since "f16 -> f32" is much more commonly available, give // CodeGen the option of emitting that before resorting to a libcall. - SDValue Res = - DAG.getNode(ISD::STRICT_FP16_TO_FP, dl, {MVT::f32, MVT::Other}, - {Node->getOperand(0), Node->getOperand(1)}); + SDValue Res = DAG.getNode(Node->getOpcode(), dl, {MVT::f32, MVT::Other}, + {Node->getOperand(0), Node->getOperand(1)}); Res = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {Node->getValueType(0), MVT::Other}, {Res.getValue(1), Res}); @@ -4651,6 +4652,16 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { Results.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false).first); } break; + case ISD::STRICT_BF16_TO_FP: + if (Node->getValueType(0) == MVT::f32) { + TargetLowering::MakeLibCallOptions CallOptions; + std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall( + DAG, RTLIB::FPEXT_BF16_F32, MVT::f32, Node->getOperand(1), + CallOptions, SDLoc(Node), Node->getOperand(0)); + Results.push_back(Tmp.first); + Results.push_back(Tmp.second); + } + break; + case ISD::STRICT_FP16_TO_FP: { if (Node->getValueType(0) == MVT::f32) { TargetLowering::MakeLibCallOptions CallOptions; @@ -4792,12 +4803,17 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { break; } case ISD::STRICT_FP_EXTEND: - case ISD::STRICT_FP_TO_FP16: { - RTLIB::Libcall LC = - Node->getOpcode() == ISD::STRICT_FP_TO_FP16 - ?
RTLIB::getFPROUND(Node->getOperand(1).getValueType(), MVT::f16) - : RTLIB::getFPEXT(Node->getOperand(1).getValueType(), - Node->getValueType(0)); + case ISD::STRICT_FP_TO_FP16: + case ISD::STRICT_FP_TO_BF16: { + RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; + if (Node->getOpcode() == ISD::STRICT_FP_TO_FP16) + LC = RTLIB::getFPROUND(Node->getOperand(1).getValueType(), MVT::f16); + else if (Node->getOpcode() == ISD::STRICT_FP_TO_BF16) + LC = RTLIB::getFPROUND(Node->getOperand(1).getValueType(), MVT::bf16); + else + LC = RTLIB::getFPEXT(Node->getOperand(1).getValueType(), + Node->getValueType(0)); + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to legalize as libcall"); TargetLowering::MakeLibCallOptions CallOptions; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index f0a04589fbfdc..3332c02ec7235 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -918,6 +918,7 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { case ISD::STRICT_FP_TO_FP16: case ISD::FP_TO_FP16: // Same as FP_ROUND for softening purposes case ISD::FP_TO_BF16: + case ISD::STRICT_FP_TO_BF16: case ISD::STRICT_FP_ROUND: case ISD::FP_ROUND: Res = SoftenFloatOp_FP_ROUND(N); break; case ISD::STRICT_FP_TO_SINT: @@ -970,6 +971,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) { assert(N->getOpcode() == ISD::FP_ROUND || N->getOpcode() == ISD::FP_TO_FP16 || N->getOpcode() == ISD::STRICT_FP_TO_FP16 || N->getOpcode() == ISD::FP_TO_BF16 || + N->getOpcode() == ISD::STRICT_FP_TO_BF16 || N->getOpcode() == ISD::STRICT_FP_ROUND); bool IsStrict = N->isStrictFPOpcode(); @@ -980,7 +982,8 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) { if (N->getOpcode() == ISD::FP_TO_FP16 || N->getOpcode() == ISD::STRICT_FP_TO_FP16) FloatRVT = MVT::f16; - else if (N->getOpcode() == ISD::FP_TO_BF16) + else if (N->getOpcode() == 
ISD::FP_TO_BF16 || + N->getOpcode() == ISD::STRICT_FP_TO_BF16) FloatRVT = MVT::bf16; RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, FloatRVT); @@ -2193,13 +2196,11 @@ static ISD::NodeType GetPromotionOpcodeStrict(EVT OpVT, EVT RetVT) { if (RetVT == MVT::f16) return ISD::STRICT_FP_TO_FP16; - if (OpVT == MVT::bf16) { - // TODO: return ISD::STRICT_BF16_TO_FP; - } + if (OpVT == MVT::bf16) + return ISD::STRICT_BF16_TO_FP; - if (RetVT == MVT::bf16) { - // TODO: return ISD::STRICT_FP_TO_BF16; - } + if (RetVT == MVT::bf16) + return ISD::STRICT_FP_TO_BF16; report_fatal_error("Attempt at an invalid promotion-related conversion"); } @@ -2999,10 +3000,16 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FP_ROUND(SDNode *N) { EVT SVT = N->getOperand(0).getValueType(); if (N->isStrictFPOpcode()) { - assert(RVT == MVT::f16); - SDValue Res = - DAG.getNode(ISD::STRICT_FP_TO_FP16, SDLoc(N), {MVT::i16, MVT::Other}, - {N->getOperand(0), N->getOperand(1)}); + // FIXME: assume we only have two f16 variants for now. + unsigned Opcode; + if (RVT == MVT::f16) + Opcode = ISD::STRICT_FP_TO_FP16; + else if (RVT == MVT::bf16) + Opcode = ISD::STRICT_FP_TO_BF16; + else + llvm_unreachable("unknown half type"); + SDValue Res = DAG.getNode(Opcode, SDLoc(N), {MVT::i16, MVT::Other}, + {N->getOperand(0), N->getOperand(1)}); ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; } @@ -3192,10 +3199,16 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FP_EXTEND(SDNode *N) { Op = GetSoftPromotedHalf(N->getOperand(IsStrict ? 
1 : 0)); if (IsStrict) { - assert(SVT == MVT::f16); + unsigned Opcode; + if (SVT == MVT::f16) + Opcode = ISD::STRICT_FP16_TO_FP; + else if (SVT == MVT::bf16) + Opcode = ISD::STRICT_BF16_TO_FP; + else + llvm_unreachable("unknown half type"); SDValue Res = - DAG.getNode(ISD::STRICT_FP16_TO_FP, SDLoc(N), - {N->getValueType(0), MVT::Other}, {N->getOperand(0), Op}); + DAG.getNode(Opcode, SDLoc(N), {N->getValueType(0), MVT::Other}, + {N->getOperand(0), Op}); ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); ReplaceValueWith(SDValue(N, 0), Res); return SDValue(); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 6e55acd22bb37..909c669abd120 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -165,6 +165,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::FP_TO_FP16: Res = PromoteIntRes_FP_TO_FP16_BF16(N); break; + case ISD::STRICT_FP_TO_BF16: case ISD::STRICT_FP_TO_FP16: Res = PromoteIntRes_STRICT_FP_TO_FP16_BF16(N); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 0fbd999694f10..18ca17e53dac3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -380,7 +380,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FP_TO_FP16: return "fp_to_fp16"; case ISD::STRICT_FP_TO_FP16: return "strict_fp_to_fp16"; case ISD::BF16_TO_FP: return "bf16_to_fp"; + case ISD::STRICT_BF16_TO_FP: return "strict_bf16_to_fp"; case ISD::FP_TO_BF16: return "fp_to_bf16"; + case ISD::STRICT_FP_TO_BF16: return "strict_fp_to_bf16"; case ISD::LROUND: return "lround"; case ISD::STRICT_LROUND: return "strict_lround"; case ISD::LLROUND: return "llround"; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp 
b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 646c0c345e54e..a2aeb66835b29 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -307,6 +307,9 @@ RTLIB::Libcall RTLIB::getFPEXT(EVT OpVT, EVT RetVT) { } else if (OpVT == MVT::f80) { if (RetVT == MVT::f128) return FPEXT_F80_F128; + } else if (OpVT == MVT::bf16) { + if (RetVT == MVT::f32) + return FPEXT_BF16_F32; } return UNKNOWN_LIBCALL; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 866a2a94a0bfe..b87e3121838dc 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -406,6 +406,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(Op, MVT::f128, Expand); } + for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) { + setOperationAction(ISD::STRICT_FP_TO_BF16, VT, Expand); + setOperationAction(ISD::STRICT_BF16_TO_FP, VT, Expand); + } + for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) { setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand); diff --git a/llvm/test/CodeGen/X86/bfloat-constrained.ll b/llvm/test/CodeGen/X86/bfloat-constrained.ll new file mode 100644 index 0000000000000..0a8c4f20648b0 --- /dev/null +++ b/llvm/test/CodeGen/X86/bfloat-constrained.ll @@ -0,0 +1,181 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+avx512bf16,+avx512vl | FileCheck %s --check-prefixes=X64 + +@a = global bfloat 0xR0000, align 2 +@b = global bfloat 0xR0000, align 2 +@c = global bfloat 0xR0000, align 2 + +define float @bfloat_to_float() strictfp { +; X86-LABEL: bfloat_to_float: +; X86: # %bb.0: +; X86-NEXT: subl $12, %esp +; X86-NEXT: 
.cfi_def_cfa_offset 16 +; X86-NEXT: movzwl a, %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: calll __extendbfsf2 +; X86-NEXT: addl $12, %esp +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl +; +; X64-LABEL: bfloat_to_float: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: movq a@GOTPCREL(%rip), %rax +; X64-NEXT: movzwl (%rax), %edi +; X64-NEXT: callq __extendbfsf2@PLT +; X64-NEXT: popq %rax +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: retq + %1 = load bfloat, ptr @a, align 2 + %2 = tail call float @llvm.experimental.constrained.fpext.f32.bfloat(bfloat %1, metadata !"fpexcept.strict") #0 + ret float %2 +} + +define double @bfloat_to_double() strictfp { +; X86-LABEL: bfloat_to_double: +; X86: # %bb.0: +; X86-NEXT: subl $12, %esp +; X86-NEXT: .cfi_def_cfa_offset 16 +; X86-NEXT: movzwl a, %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: calll __extendbfsf2 +; X86-NEXT: addl $12, %esp +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl +; +; X64-LABEL: bfloat_to_double: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: movq a@GOTPCREL(%rip), %rax +; X64-NEXT: movzwl (%rax), %edi +; X64-NEXT: callq __extendbfsf2@PLT +; X64-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; X64-NEXT: popq %rax +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: retq + %1 = load bfloat, ptr @a, align 2 + %2 = tail call double @llvm.experimental.constrained.fpext.f64.bfloat(bfloat %1, metadata !"fpexcept.strict") #0 + ret double %2 +} + +define void @float_to_bfloat(float %0) strictfp { +; X86-LABEL: float_to_bfloat: +; X86: # %bb.0: +; X86-NEXT: subl $12, %esp +; X86-NEXT: .cfi_def_cfa_offset 16 +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: fstps (%esp) +; X86-NEXT: wait +; X86-NEXT: calll __truncsfbf2 +; X86-NEXT: movw %ax, a +; X86-NEXT: addl $12, %esp +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl +; +; X64-LABEL: float_to_bfloat: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: .cfi_def_cfa_offset 
16 +; X64-NEXT: callq __truncsfbf2@PLT +; X64-NEXT: movq a@GOTPCREL(%rip), %rcx +; X64-NEXT: movw %ax, (%rcx) +; X64-NEXT: popq %rax +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: retq + %2 = tail call bfloat @llvm.experimental.constrained.fptrunc.bfloat.f32(float %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + store bfloat %2, ptr @a, align 2 + ret void +} + +define void @double_to_bfloat(double %0) strictfp { +; X86-LABEL: double_to_bfloat: +; X86: # %bb.0: +; X86-NEXT: subl $12, %esp +; X86-NEXT: .cfi_def_cfa_offset 16 +; X86-NEXT: fldl {{[0-9]+}}(%esp) +; X86-NEXT: fstpl (%esp) +; X86-NEXT: wait +; X86-NEXT: calll __truncdfbf2 +; X86-NEXT: movw %ax, a +; X86-NEXT: addl $12, %esp +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl +; +; X64-LABEL: double_to_bfloat: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: callq __truncdfbf2@PLT +; X64-NEXT: movq a@GOTPCREL(%rip), %rcx +; X64-NEXT: movw %ax, (%rcx) +; X64-NEXT: popq %rax +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: retq + %2 = tail call bfloat @llvm.experimental.constrained.fptrunc.bfloat.f64(double %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + store bfloat %2, ptr @a, align 2 + ret void +} + +define void @add() strictfp { +; X86-LABEL: add: +; X86: # %bb.0: +; X86-NEXT: subl $12, %esp +; X86-NEXT: .cfi_def_cfa_offset 16 +; X86-NEXT: movzwl a, %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: calll __extendbfsf2 +; X86-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: wait +; X86-NEXT: movzwl b, %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: calll __extendbfsf2 +; X86-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: faddp %st, %st(1) +; X86-NEXT: fstps (%esp) +; X86-NEXT: wait +; X86-NEXT: calll __truncsfbf2 +; X86-NEXT: movw %ax, c +; X86-NEXT: addl $12, %esp +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl +; +; X64-LABEL: add: +; X64: # %bb.0: +; X64-NEXT: pushq %rax 
+; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: movq a@GOTPCREL(%rip), %rax +; X64-NEXT: movzwl (%rax), %edi +; X64-NEXT: callq __extendbfsf2@PLT +; X64-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movq b@GOTPCREL(%rip), %rax +; X64-NEXT: movzwl (%rax), %edi +; X64-NEXT: callq __extendbfsf2@PLT +; X64-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; X64-NEXT: callq __truncsfbf2@PLT +; X64-NEXT: movq c@GOTPCREL(%rip), %rcx +; X64-NEXT: movw %ax, (%rcx) +; X64-NEXT: popq %rax +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: retq + %1 = load bfloat, ptr @a, align 2 + %2 = tail call float @llvm.experimental.constrained.fpext.f32.bfloat(bfloat %1, metadata !"fpexcept.strict") #0 + %3 = load bfloat, ptr @b, align 2 + %4 = tail call float @llvm.experimental.constrained.fpext.f32.bfloat(bfloat %3, metadata !"fpexcept.strict") #0 + %5 = tail call float @llvm.experimental.constrained.fadd.f32(float %2, float %4, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + %6 = tail call bfloat @llvm.experimental.constrained.fptrunc.bfloat.f32(float %5, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + store bfloat %6, ptr @c, align 2 + ret void +} + +declare float @llvm.experimental.constrained.fpext.f32.bfloat(bfloat, metadata) +declare double @llvm.experimental.constrained.fpext.f64.bfloat(bfloat, metadata) +declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata) +declare bfloat @llvm.experimental.constrained.fptrunc.bfloat.f32(float, metadata, metadata) +declare bfloat @llvm.experimental.constrained.fptrunc.bfloat.f64(double, metadata, metadata) + +attributes #0 = { strictfp } +