Skip to content

__bf16 treated like _Float16 under #pragma STDC FENV_ACCESS ON (x86_64 clang 18.1.0) #104465

Closed
@ngbronson-openai

Description

@ngbronson-openai

Under #pragma STDC FENV_ACCESS ON (or -ffp-exception-behavior=strict), __bf16 <-> float conversions are compiled using the half-precision (_Float16) intrinsics (vcvtph2ps / vcvtps2ph) instead of BF16 conversions.

My test code:

#pragma STDC FENV_ACCESS ON

float widenB(__bf16 x) { return x; }
 __bf16 narrowB(float x) { return x; }

float widenF(_Float16 x) { return x; }
_Float16 narrowF(float x) { return x; }

#pragma STDC FENV_ACCESS OFF

float widenB2(__bf16 x) { return x; }
__bf16 narrowB2(float x) { return x; }
float widenF2(_Float16 x) { return x; }
_Float16 narrowF2(float x) { return x; }

Compiled on Godbolt (x86-64 clang 18.1.0) with -march=sapphirerapids -std=gnu++2b -O3:

_Z6widenBDF16b:
        vmovw   eax, xmm0
        movzx   eax, ax
        vmovd   xmm0, eax
        vcvtph2ps       xmm0, xmm0
        ret

narrowB(float):
        vxorps  xmm1, xmm1, xmm1
        vblendps        xmm0, xmm1, xmm0, 1
        vcvtps2ph       xmm0, xmm0, 4
        vmovw   eax, xmm0
        vmovw   xmm0, eax
        ret

widenF(_Float16):
        vcvtsh2ss       xmm0, xmm0, xmm0
        ret

narrowF(float):
        vcvtss2sh       xmm0, xmm0, xmm0
        ret

_Z7widenB2DF16b:
        vmovw   eax, xmm0
        shl     eax, 16
        vmovd   xmm0, eax
        ret

narrowB2(float):
        vcvtneps2bf16   xmm0, xmm0
        vmovw   eax, xmm0
        vmovw   xmm0, eax
        ret

widenF2(_Float16):
        vcvtsh2ss       xmm0, xmm0, xmm0
        ret

narrowF2(float):
        vcvtss2sh       xmm0, xmm0, xmm0
        ret

Note that the BF16 conversions (vcvtneps2bf16 for narrowing, a 16-bit left shift for widening) are used only for the versions compiled without FENV_ACCESS.

With broadwell as the target, the same issue occurs (with FENV_ACCESS, __bf16 conversions are compiled as if they were _Float16):

_Z6widenBDF16b:
        vpextrw eax, xmm0, 0
        movzx   eax, ax
        vmovd   xmm0, eax
        vcvtph2ps       xmm0, xmm0
        ret

narrowB(float):
        vxorps  xmm1, xmm1, xmm1
        vblendps        xmm0, xmm1, xmm0, 1
        vcvtps2ph       xmm0, xmm0, 4
        vmovd   eax, xmm0
        vpinsrw xmm0, xmm0, eax, 0
        ret

widenF(_Float16):
        vpextrw eax, xmm0, 0
        movzx   eax, ax
        vmovd   xmm0, eax
        vcvtph2ps       xmm0, xmm0
        ret

narrowF(float):
        vxorps  xmm1, xmm1, xmm1
        vblendps        xmm0, xmm1, xmm0, 1
        vcvtps2ph       xmm0, xmm0, 4
        vmovd   eax, xmm0
        vpinsrw xmm0, xmm0, eax, 0
        ret

_Z7widenB2DF16b:
        vpextrw eax, xmm0, 0
        shl     eax, 16
        vmovd   xmm0, eax
        ret

narrowB2(float):
        push    rax
        call    __truncsfbf2@PLT
        pop     rax
        ret

widenF2(_Float16):
        vpextrw eax, xmm0, 0
        movzx   eax, ax
        vmovd   xmm0, eax
        vcvtph2ps       xmm0, xmm0
        ret

narrowF2(float):
        vcvtps2ph       xmm0, xmm0, 4
        vmovd   eax, xmm0
        vpinsrw xmm0, xmm0, eax, 0
        ret

Metadata

Metadata

Assignees

No one assigned

    Labels

    clang (Clang issues not falling into any other category), floating-point (Floating-point math)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions