Skip to content

Commit ff72c83

Browse files
authored
[X86] Add missing subvector_subreg_lowering for BF16 (#83720)
Fixes: #83358
1 parent 2b5cd8b commit ff72c83

File tree

3 files changed

+28
-4
lines changed

3 files changed

+28
-4
lines changed

llvm/lib/Target/X86/X86InstrVecCompiler.td

+3
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ defm : subvector_subreg_lowering<VR128, v2f64, VR256, v4f64, sub_xmm>;
8383
defm : subvector_subreg_lowering<VR128, v8i16, VR256, v16i16, sub_xmm>;
8484
defm : subvector_subreg_lowering<VR128, v16i8, VR256, v32i8, sub_xmm>;
8585
defm : subvector_subreg_lowering<VR128, v8f16, VR256, v16f16, sub_xmm>;
86+
defm : subvector_subreg_lowering<VR128, v8bf16, VR256, v16bf16, sub_xmm>;
8687

8788
// A 128-bit subvector extract from the first 512-bit vector position is a
8889
// subregister copy that needs no instruction. Likewise, a 128-bit subvector
@@ -95,6 +96,7 @@ defm : subvector_subreg_lowering<VR128, v2f64, VR512, v8f64, sub_xmm>;
9596
defm : subvector_subreg_lowering<VR128, v8i16, VR512, v32i16, sub_xmm>;
9697
defm : subvector_subreg_lowering<VR128, v16i8, VR512, v64i8, sub_xmm>;
9798
defm : subvector_subreg_lowering<VR128, v8f16, VR512, v32f16, sub_xmm>;
99+
defm : subvector_subreg_lowering<VR128, v8bf16, VR512, v32bf16, sub_xmm>;
98100

99101
// A 256-bit subvector extract from the first 512-bit vector position is a
100102
// subregister copy that needs no instruction. Likewise, a 256-bit subvector
@@ -107,6 +109,7 @@ defm : subvector_subreg_lowering<VR256, v4f64, VR512, v8f64, sub_ymm>;
107109
defm : subvector_subreg_lowering<VR256, v16i16, VR512, v32i16, sub_ymm>;
108110
defm : subvector_subreg_lowering<VR256, v32i8, VR512, v64i8, sub_ymm>;
109111
defm : subvector_subreg_lowering<VR256, v16f16, VR512, v32f16, sub_ymm>;
112+
defm : subvector_subreg_lowering<VR256, v16bf16, VR512, v32bf16, sub_ymm>;
110113

111114

112115
// If we're inserting into an all zeros vector, just use a plain move which

llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll

+22
Original file line numberDiff line numberDiff line change
@@ -381,3 +381,25 @@ entry:
381381
%1 = shufflevector <8 x bfloat> %0, <8 x bfloat> undef, <16 x i32> zeroinitializer
382382
ret <16 x bfloat> %1
383383
}
384+
385+
define <16 x i32> @pr83358() {
386+
; X86-LABEL: pr83358:
387+
; X86: # %bb.0:
388+
; X86-NEXT: vcvtneps2bf16y {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0x05,A,A,A,A]
389+
; X86-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
390+
; X86-NEXT: vshufi64x2 $0, %zmm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x43,0xc0,0x00]
391+
; X86-NEXT: # zmm0 = zmm0[0,1,0,1,0,1,0,1]
392+
; X86-NEXT: retl # encoding: [0xc3]
393+
;
394+
; X64-LABEL: pr83358:
395+
; X64: # %bb.0:
396+
; X64-NEXT: vcvtneps2bf16y {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0x05,A,A,A,A]
397+
; X64-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
398+
; X64-NEXT: vshufi64x2 $0, %zmm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x43,0xc0,0x00]
399+
; X64-NEXT: # zmm0 = zmm0[0,1,0,1,0,1,0,1]
400+
; X64-NEXT: retq # encoding: [0xc3]
401+
%1 = call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>)
402+
%2 = bitcast <8 x bfloat> %1 to <4 x i32>
403+
%3 = shufflevector <4 x i32> %2, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
404+
ret <16 x i32> %3
405+
}

llvm/test/CodeGen/X86/bfloat.ll

+3-4
Original file line numberDiff line numberDiff line change
@@ -511,7 +511,7 @@ define void @fold_ext_trunc(ptr %pa, ptr %pc) nounwind {
511511
define bfloat @fold_ext_trunc2(bfloat %a) nounwind {
512512
; X86-LABEL: fold_ext_trunc2:
513513
; X86: # %bb.0:
514-
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
514+
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
515515
; X86-NEXT: retl
516516
;
517517
; CHECK-LABEL: fold_ext_trunc2:
@@ -934,8 +934,8 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
934934
define <2 x bfloat> @pr62997(bfloat %a, bfloat %b) {
935935
; X86-LABEL: pr62997:
936936
; X86: # %bb.0:
937-
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
938-
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
937+
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
938+
; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
939939
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
940940
; X86-NEXT: retl
941941
;
@@ -2423,7 +2423,6 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
24232423
; AVXNC-LABEL: fptrunc_v16f32:
24242424
; AVXNC: # %bb.0:
24252425
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0
2426-
; AVXNC-NEXT: vinsertf128 $0, %xmm0, %ymm0, %ymm0
24272426
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm1, %xmm1
24282427
; AVXNC-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
24292428
; AVXNC-NEXT: retq

0 commit comments

Comments
 (0)