Closed
Description
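Under #pragma STDC FENV_ACCESS ON (or -ffp-exception-behavior=strict), __bf16 <-> float conversions are compiled using the half-precision (_Float16) conversion instructions instead of bfloat16 conversions, so the 16-bit value is interpreted in the wrong format.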
My test code
// Reproducer: the same four conversions are compiled twice, once with
// FENV_ACCESS ON and once with it OFF, so the generated code can be compared.
// The "B" functions convert between __bf16 and float; the "F" functions
// convert between _Float16 and float and serve as the half-precision reference.

// Group 1: FENV_ACCESS ON.
#pragma STDC FENV_ACCESS ON
float widenB(__bf16 x) { return x; }
__bf16 narrowB(float x) { return x; }
float widenF(_Float16 x) { return x; }
_Float16 narrowF(float x) { return x; }
// Group 2: identical bodies with FENV_ACCESS OFF (names carry a "2" suffix).
#pragma STDC FENV_ACCESS OFF
float widenB2(__bf16 x) { return x; }
__bf16 narrowB2(float x) { return x; }
float widenF2(_Float16 x) { return x; }
_Float16 narrowF2(float x) { return x; }
Compiled on godbolt x86-64 clang 18.1.0
with -march=sapphirerapids -std=gnu++2b -O3
# widenB(__bf16) -> float, FENV_ACCESS ON, -march=sapphirerapids.
# (Mangled symbol for widenB taking a __bf16; not demangled in this listing.)
# In:  xmm0 low 16 bits = argument.  Out: xmm0 = float result.
_Z6widenBDF16b:
vmovw eax, xmm0
# Keep only the low 16 bits of the argument in eax.
movzx eax, ax
vmovd xmm0, eax
# Problem shown by this report: vcvtph2ps converts from IEEE half precision,
# so the __bf16 bit pattern is decoded as if it were a _Float16.
vcvtph2ps xmm0, xmm0
ret
# narrowB(float) -> __bf16, FENV_ACCESS ON, -march=sapphirerapids.
# In:  xmm0 = float argument.  Out: xmm0 low 16 bits = result.
narrowB(float):
# Zero xmm1, then keep only element 0 of xmm0 and take the other elements
# from the zeroed register (presumably so the packed conversion below only
# sees the scalar input plus zeros).
vxorps xmm1, xmm1, xmm1
vblendps xmm0, xmm1, xmm0, 1
# Problem shown by this report: vcvtps2ph produces IEEE half precision,
# not bfloat16. Immediate 4 selects the rounding mode from MXCSR.
vcvtps2ph xmm0, xmm0, 4
# Move the 16-bit result through eax and back into xmm0.
vmovw eax, xmm0
vmovw xmm0, eax
ret
# widenF(_Float16) -> float, FENV_ACCESS ON, -march=sapphirerapids.
# Reference case: a half-precision source converted with the scalar
# half -> single instruction.
widenF(_Float16):
vcvtsh2ss xmm0, xmm0, xmm0
ret
# narrowF(float) -> _Float16, FENV_ACCESS ON, -march=sapphirerapids.
# Reference case: scalar single -> half conversion.
narrowF(float):
vcvtss2sh xmm0, xmm0, xmm0
ret
# widenB2(__bf16) -> float, FENV_ACCESS OFF, -march=sapphirerapids.
# (Mangled symbol for widenB2 taking a __bf16; not demangled in this listing.)
# Expected bfloat16 widening: the 16 bits become the upper half of the
# 32-bit float pattern and the lower half is zero.
_Z7widenB2DF16b:
vmovw eax, xmm0
# Shift the bf16 bits into bits 31..16 of the float pattern.
shl eax, 16
vmovd xmm0, eax
ret
# narrowB2(float) -> __bf16, FENV_ACCESS OFF, -march=sapphirerapids.
# Expected bfloat16 narrowing via the dedicated single -> bf16 instruction.
narrowB2(float):
vcvtneps2bf16 xmm0, xmm0
# Move the 16-bit result through eax and back into xmm0.
vmovw eax, xmm0
vmovw xmm0, eax
ret
# widenF2(_Float16) -> float, FENV_ACCESS OFF, -march=sapphirerapids.
# Same instruction as widenF(_Float16) in this listing: the pragma does not
# change the _Float16 widening code here.
widenF2(_Float16):
vcvtsh2ss xmm0, xmm0, xmm0
ret
# narrowF2(float) -> _Float16, FENV_ACCESS OFF, -march=sapphirerapids.
# Same instruction as narrowF(float) in this listing.
narrowF2(float):
vcvtss2sh xmm0, xmm0, xmm0
ret
Note that the bfloat16-specific code (vcvtneps2bf16 for narrowing, a 16-bit left shift for widening) is emitted only for the versions without FENV_ACCESS.
With broadwell as the target the same issue occurs: with FENV_ACCESS ON, the __bf16 conversions are compiled as if the operand were _Float16 (widenB is identical to widenF, and narrowB is identical to narrowF):
# widenB(__bf16) -> float, FENV_ACCESS ON, broadwell target.
# (Mangled symbol for widenB taking a __bf16; not demangled in this listing.)
# Instruction-for-instruction identical to widenF(_Float16) in this listing.
_Z6widenBDF16b:
# Extract word 0 of xmm0 (the 16-bit argument) into eax.
vpextrw eax, xmm0, 0
movzx eax, ax
vmovd xmm0, eax
# Problem shown by this report: the __bf16 bits are converted with the
# half -> single instruction.
vcvtph2ps xmm0, xmm0
ret
# narrowB(float) -> __bf16, FENV_ACCESS ON, broadwell target.
# Instruction-for-instruction identical to narrowF(float) in this listing.
narrowB(float):
# Keep element 0 of xmm0 and zero the remaining elements.
vxorps xmm1, xmm1, xmm1
vblendps xmm0, xmm1, xmm0, 1
# Problem shown by this report: single -> half conversion used for a
# __bf16 result. Immediate 4 selects the rounding mode from MXCSR.
vcvtps2ph xmm0, xmm0, 4
# Copy the result to eax and insert its low 16 bits as word 0 of xmm0.
vmovd eax, xmm0
vpinsrw xmm0, xmm0, eax, 0
ret
# widenF(_Float16) -> float, FENV_ACCESS ON, broadwell target.
# Reference case: the half-precision source is extracted as a 16-bit word
# and converted with vcvtph2ps.
widenF(_Float16):
vpextrw eax, xmm0, 0
movzx eax, ax
vmovd xmm0, eax
vcvtph2ps xmm0, xmm0
ret
# narrowF(float) -> _Float16, FENV_ACCESS ON, broadwell target.
# Reference case: single -> half conversion.
narrowF(float):
# Keep element 0 of xmm0 and zero the remaining elements.
vxorps xmm1, xmm1, xmm1
vblendps xmm0, xmm1, xmm0, 1
# Immediate 4 selects the rounding mode from MXCSR.
vcvtps2ph xmm0, xmm0, 4
# Copy the result to eax and insert its low 16 bits as word 0 of xmm0.
vmovd eax, xmm0
vpinsrw xmm0, xmm0, eax, 0
ret
# widenB2(__bf16) -> float, FENV_ACCESS OFF, broadwell target.
# (Mangled symbol for widenB2 taking a __bf16; not demangled in this listing.)
# Expected bfloat16 widening: the 16 bits are shifted into the upper half
# of the 32-bit float pattern.
_Z7widenB2DF16b:
vpextrw eax, xmm0, 0
shl eax, 16
vmovd xmm0, eax
ret
# narrowB2(float) -> __bf16, FENV_ACCESS OFF, broadwell target.
# The narrowing is delegated to the __truncsfbf2 routine, called via the PLT;
# no half-precision instruction is involved.
narrowB2(float):
# push/pop of rax brackets the call (presumably only to keep the stack
# 16-byte aligned at the call site; the pushed value itself is not used).
push rax
call __truncsfbf2@PLT
pop rax
ret
# widenF2(_Float16) -> float, FENV_ACCESS OFF, broadwell target.
# Same sequence as widenF(_Float16) in this listing.
widenF2(_Float16):
vpextrw eax, xmm0, 0
movzx eax, ax
vmovd xmm0, eax
vcvtph2ps xmm0, xmm0
ret
# narrowF2(float) -> _Float16, FENV_ACCESS OFF, broadwell target.
# Unlike narrowF(float) in this listing, there is no vxorps/vblendps step
# before the conversion.
narrowF2(float):
vcvtps2ph xmm0, xmm0, 4
# Copy the result to eax and insert its low 16 bits as word 0 of xmm0.
vmovd eax, xmm0
vpinsrw xmm0, xmm0, eax, 0
ret