Skip to content

Commit e713e15

Browse files
committed
[SDISel][Builder] Fix the instantiation of <1 x bfloat|half>
Prior to this change, `SelectionDAGBuilder` was producing `SDNode`s of the form: `f32 = extract_vector_elt <1 x bfloat|half>, i32 0` when lowering phis of `<1 x bfloat|half>` and running on a target that promotes this type to `f32` (like some x86 or AMDGPU targets.) This construct is invalid since this type of node only allows type extensions for integer types. It went unotice because the `extract_vector_elt` node is later broken down in `bitcast` followed by `bf16_to_fp|fp_extend`. However, when the argument of the phi is a constant we were crashing because the existing code would try to constant fold this `extract_vector_elt` into a any_ext. This patch fixes this by using a proper decomposition for `<1 x bfloat|half>`: ``` bfloat|half = bitcast <1 x blfoat|half> float = fp_extend bfloat|half ``` This change should be NFC for the non-constant-folding cases and fix the SDISel crashes (reported in #94449) for the folding cases.
1 parent 6c9bce8 commit e713e15

File tree

3 files changed

+259
-2
lines changed

3 files changed

+259
-2
lines changed

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -729,8 +729,17 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
729729
// prevents it from being picked up by the earlier bitcast case.
730730
if (ValueVT.getVectorElementCount().isScalar() &&
731731
(!ValueVT.isFloatingPoint() || !PartVT.isInteger())) {
732-
Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val,
733-
DAG.getVectorIdxConstant(0, DL));
732+
// If we reach this condition and PartVT is FP, this means that
733+
// ValueVT is also FP and both have a different size, otherwise we
734+
// would have bitcasted them. Producing an EXTRACT_VECTOR_ELT here
735+
// would be invalid since that would mean the smaller FP type has to
736+
// be extended to the larger one.
737+
if (PartVT.isFloatingPoint()) {
738+
Val = DAG.getBitcast(ValueVT.getScalarType(), Val);
739+
Val = DAG.getNode(ISD::FP_EXTEND, DL, PartVT, Val);
740+
} else
741+
Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val,
742+
DAG.getVectorIdxConstant(0, DL));
734743
} else {
735744
uint64_t ValueSize = ValueVT.getFixedSizeInBits();
736745
assert(PartVT.getFixedSizeInBits() > ValueSize &&
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=hawaii %s -o - | FileCheck %s
3+
4+
; For all these tests we disable optimizations through function attributes
5+
; because the code we are exercising here needs phis and we want to keep the
6+
; IR small.
7+
8+
; This code used to crash in SDISel because f16 was promoted to f32 through
9+
; a `f32 = vector_extract_elt <1 x f16>, i32 0`, which is illegal.
10+
; The invalid SDNode and thus, the crash was only exposed by the constant
11+
; folding.
12+
define void @phi_vec1half_to_f32_with_const_folding(ptr addrspace(1) %dst) #0 {
13+
; CHECK-LABEL: phi_vec1half_to_f32_with_const_folding:
14+
; CHECK: ; %bb.0: ; %entry
15+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16+
; CHECK-NEXT: s_mov_b32 s4, 0
17+
; CHECK-NEXT: v_cvt_f32_f16_e64 v2, s4
18+
; CHECK-NEXT: ; %bb.1: ; %bb
19+
; CHECK-NEXT: v_cvt_f16_f32_e64 v2, v2
20+
; CHECK-NEXT: s_mov_b32 s7, 0xf000
21+
; CHECK-NEXT: s_mov_b32 s6, 0
22+
; CHECK-NEXT: s_mov_b32 s4, s6
23+
; CHECK-NEXT: s_mov_b32 s5, s6
24+
; CHECK-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 offset:2
25+
; CHECK-NEXT: v_cvt_f16_f32_e64 v2, s4
26+
; CHECK-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64
27+
; CHECK-NEXT: s_waitcnt vmcnt(0)
28+
; CHECK-NEXT: s_setpc_b64 s[30:31]
29+
entry:
30+
br label %bb
31+
32+
bb:
33+
%phi = phi <1 x half> [ zeroinitializer, %entry ]
34+
%res = shufflevector <1 x half> poison, <1 x half> %phi, <2 x i32> <i32 0, i32 1>
35+
store <2 x half> %res, ptr addrspace(1) %dst
36+
ret void
37+
}
38+
39+
; Same as phi_vec1half_to_f32_with_const_folding but without the folding.
40+
; This test exercises the same invalid SDNode, but it happened to work by
41+
; accident before. Here we make sure the fix also work as expected in the
42+
; non-constant folding case.
43+
define void @phi_vec1half_to_f32(ptr addrspace(1) %src, ptr addrspace(1) %dst) #0 {
44+
; CHECK-LABEL: phi_vec1half_to_f32:
45+
; CHECK: ; %bb.0: ; %entry
46+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
47+
; CHECK-NEXT: s_mov_b32 s7, 0xf000
48+
; CHECK-NEXT: s_mov_b32 s6, 0
49+
; CHECK-NEXT: s_mov_b32 s4, s6
50+
; CHECK-NEXT: s_mov_b32 s5, s6
51+
; CHECK-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
52+
; CHECK-NEXT: s_waitcnt vmcnt(0)
53+
; CHECK-NEXT: v_cvt_f32_f16_e64 v0, v0
54+
; CHECK-NEXT: ; %bb.1: ; %bb
55+
; CHECK-NEXT: v_cvt_f16_f32_e64 v0, v0
56+
; CHECK-NEXT: s_mov_b32 s7, 0xf000
57+
; CHECK-NEXT: s_mov_b32 s6, 0
58+
; CHECK-NEXT: s_mov_b32 s4, s6
59+
; CHECK-NEXT: s_mov_b32 s5, s6
60+
; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 offset:2
61+
; CHECK-NEXT: v_cvt_f16_f32_e64 v0, s4
62+
; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
63+
; CHECK-NEXT: s_waitcnt vmcnt(0)
64+
; CHECK-NEXT: s_setpc_b64 s[30:31]
65+
entry:
66+
%input = load <1 x half>, ptr addrspace(1) %src
67+
br label %bb
68+
69+
bb:
70+
%phi = phi <1 x half> [ %input, %entry ]
71+
%res = shufflevector <1 x half> poison, <1 x half> %phi, <2 x i32> <i32 0, i32 1>
72+
store <2 x half> %res, ptr addrspace(1) %dst
73+
ret void
74+
}
75+
76+
; Same as phi_vec1bf16_to_f32 but with bfloat instead of half.
77+
define void @phi_vec1bf16_to_f32(ptr addrspace(1) %src, ptr addrspace(1) %dst) #0 {
78+
; CHECK-LABEL: phi_vec1bf16_to_f32:
79+
; CHECK: ; %bb.0: ; %entry
80+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
81+
; CHECK-NEXT: s_mov_b32 s7, 0xf000
82+
; CHECK-NEXT: s_mov_b32 s6, 0
83+
; CHECK-NEXT: s_mov_b32 s4, s6
84+
; CHECK-NEXT: s_mov_b32 s5, s6
85+
; CHECK-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
86+
; CHECK-NEXT: s_mov_b32 s4, 16
87+
; CHECK-NEXT: s_waitcnt vmcnt(0)
88+
; CHECK-NEXT: v_lshlrev_b32_e64 v0, s4, v0
89+
; CHECK-NEXT: ; %bb.1: ; %bb
90+
; CHECK-NEXT: v_mul_f32_e64 v0, 1.0, v0
91+
; CHECK-NEXT: s_mov_b32 s4, 16
92+
; CHECK-NEXT: v_lshrrev_b32_e64 v0, s4, v0
93+
; CHECK-NEXT: s_mov_b32 s7, 0xf000
94+
; CHECK-NEXT: s_mov_b32 s6, 0
95+
; CHECK-NEXT: s_mov_b32 s4, s6
96+
; CHECK-NEXT: s_mov_b32 s5, s6
97+
; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 offset:2
98+
; CHECK-NEXT: s_waitcnt vmcnt(0)
99+
; CHECK-NEXT: s_setpc_b64 s[30:31]
100+
entry:
101+
%input = load <1 x bfloat>, ptr addrspace(1) %src
102+
br label %bb
103+
104+
bb:
105+
%phi = phi <1 x bfloat> [ %input, %entry ]
106+
%res = shufflevector <1 x bfloat> poison, <1 x bfloat> %phi, <2 x i32> <i32 0, i32 1>
107+
store <2 x bfloat> %res, ptr addrspace(1) %dst
108+
ret void
109+
}
110+
111+
; Same as phi_vec1half_to_f32_with_const_folding but with bfloat instead of half.
112+
define void @phi_vec1bf16_to_f32_with_const_folding(ptr addrspace(1) %dst) #0 {
113+
; CHECK-LABEL: phi_vec1bf16_to_f32_with_const_folding:
114+
; CHECK: ; %bb.0: ; %entry
115+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
116+
; CHECK-NEXT: s_mov_b32 s4, 0
117+
; CHECK-NEXT: ; %bb.1: ; %bb
118+
; CHECK-NEXT: v_mul_f32_e64 v2, 1.0, s4
119+
; CHECK-NEXT: s_mov_b32 s4, 16
120+
; CHECK-NEXT: v_lshrrev_b32_e32 v2, s4, v2
121+
; CHECK-NEXT: s_mov_b32 s7, 0xf000
122+
; CHECK-NEXT: s_mov_b32 s6, 0
123+
; CHECK-NEXT: s_mov_b32 s4, s6
124+
; CHECK-NEXT: s_mov_b32 s5, s6
125+
; CHECK-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 offset:2
126+
; CHECK-NEXT: s_waitcnt vmcnt(0)
127+
; CHECK-NEXT: s_setpc_b64 s[30:31]
128+
entry:
129+
br label %bb
130+
131+
bb:
132+
%phi = phi <1 x bfloat> [ zeroinitializer, %entry ]
133+
%res = shufflevector <1 x bfloat> poison, <1 x bfloat> %phi, <2 x i32> <i32 0, i32 1>
134+
store <2 x bfloat> %res, ptr addrspace(1) %dst
135+
ret void
136+
}
137+
138+
attributes #0 = { noinline optnone }
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -global-isel=0 -mcpu=generic -mtriple=x86_64-apple-darwin %s -o - | FileCheck %s
3+
4+
; For all these tests we disable optimizations through function attributes
5+
; because the code we are exercising here needs phis and we want to keep the
6+
; IR small.
7+
8+
; This code used to crash in SDISel because f16 was promoted to f32 through
9+
; a `f32 = vector_extract_elt <1 x f16>, i32 0`, which is illegal.
10+
; The invalid SDNode and thus, the crash was only exposed by the constant
11+
; folding.
12+
define void @phi_vec1half_to_f32_with_const_folding(ptr %dst) #0 {
13+
; CHECK-LABEL: phi_vec1half_to_f32_with_const_folding:
14+
; CHECK: ## %bb.0: ## %entry
15+
; CHECK-NEXT: xorps %xmm0, %xmm0
16+
; CHECK-NEXT: jmp LBB0_1
17+
; CHECK-NEXT: LBB0_1: ## %bb
18+
; CHECK-NEXT: pextrw $0, %xmm0, %eax
19+
; CHECK-NEXT: movw %ax, 2(%rdi)
20+
; CHECK-NEXT: retq
21+
entry:
22+
br label %bb
23+
24+
bb:
25+
%phi = phi <1 x half> [ zeroinitializer, %entry ]
26+
%res = shufflevector <1 x half> poison, <1 x half> %phi, <2 x i32> <i32 0, i32 1>
27+
store <2 x half> %res, ptr %dst
28+
ret void
29+
}
30+
31+
; Same as phi_vec1half_to_f32_with_const_folding but without the folding.
32+
; This test exercises the same invalid SDNode, but it happened to work by
33+
; accident before. Here we make sure the fix also work as expected in the
34+
; non-constant folding case.
35+
define void @phi_vec1half_to_f32(ptr %src, ptr %dst) #0 {
36+
; CHECK-LABEL: phi_vec1half_to_f32:
37+
; CHECK: ## %bb.0: ## %entry
38+
; CHECK-NEXT: pinsrw $0, (%rdi), %xmm0
39+
; CHECK-NEXT: jmp LBB1_1
40+
; CHECK-NEXT: LBB1_1: ## %bb
41+
; CHECK-NEXT: pextrw $0, %xmm0, %eax
42+
; CHECK-NEXT: movw %ax, 2(%rsi)
43+
; CHECK-NEXT: retq
44+
entry:
45+
%input = load <1 x half>, ptr %src
46+
br label %bb
47+
48+
bb:
49+
%phi = phi <1 x half> [ %input, %entry ]
50+
%res = shufflevector <1 x half> poison, <1 x half> %phi, <2 x i32> <i32 0, i32 1>
51+
store <2 x half> %res, ptr %dst
52+
ret void
53+
}
54+
55+
; Same as phi_vec1bf16_to_f32 but with bfloat instead of half.
56+
define void @phi_vec1bf16_to_f32(ptr %src, ptr %dst) #0 {
57+
; CHECK-LABEL: phi_vec1bf16_to_f32:
58+
; CHECK: ## %bb.0: ## %entry
59+
; CHECK-NEXT: pushq %rbx
60+
; CHECK-NEXT: .cfi_def_cfa_offset 16
61+
; CHECK-NEXT: .cfi_offset %rbx, -16
62+
; CHECK-NEXT: movq %rsi, %rbx
63+
; CHECK-NEXT: movzwl (%rdi), %eax
64+
; CHECK-NEXT: shll $16, %eax
65+
; CHECK-NEXT: movd %eax, %xmm0
66+
; CHECK-NEXT: jmp LBB2_1
67+
; CHECK-NEXT: LBB2_1: ## %bb
68+
; CHECK-NEXT: callq ___truncsfbf2
69+
; CHECK-NEXT: pextrw $0, %xmm0, %eax
70+
; CHECK-NEXT: movw %ax, 2(%rbx)
71+
; CHECK-NEXT: popq %rbx
72+
; CHECK-NEXT: retq
73+
entry:
74+
%input = load <1 x bfloat>, ptr %src
75+
br label %bb
76+
77+
bb:
78+
%phi = phi <1 x bfloat> [ %input, %entry ]
79+
%res = shufflevector <1 x bfloat> poison, <1 x bfloat> %phi, <2 x i32> <i32 0, i32 1>
80+
store <2 x bfloat> %res, ptr %dst
81+
ret void
82+
}
83+
84+
; Same as phi_vec1half_to_f32_with_const_folding but with bfloat instead of half.
85+
define void @phi_vec1bf16_to_f32_with_const_folding(ptr %dst) #0 {
86+
; CHECK-LABEL: phi_vec1bf16_to_f32_with_const_folding:
87+
; CHECK: ## %bb.0: ## %entry
88+
; CHECK-NEXT: pushq %rbx
89+
; CHECK-NEXT: .cfi_def_cfa_offset 16
90+
; CHECK-NEXT: .cfi_offset %rbx, -16
91+
; CHECK-NEXT: movq %rdi, %rbx
92+
; CHECK-NEXT: jmp LBB3_1
93+
; CHECK-NEXT: LBB3_1: ## %bb
94+
; CHECK-NEXT: xorps %xmm0, %xmm0
95+
; CHECK-NEXT: callq ___truncsfbf2
96+
; CHECK-NEXT: pextrw $0, %xmm0, %eax
97+
; CHECK-NEXT: movw %ax, 2(%rbx)
98+
; CHECK-NEXT: popq %rbx
99+
; CHECK-NEXT: retq
100+
entry:
101+
br label %bb
102+
103+
bb:
104+
%phi = phi <1 x bfloat> [ zeroinitializer, %entry ]
105+
%res = shufflevector <1 x bfloat> poison, <1 x bfloat> %phi, <2 x i32> <i32 0, i32 1>
106+
store <2 x bfloat> %res, ptr %dst
107+
ret void
108+
}
109+
110+
attributes #0 = { noinline optnone }

0 commit comments

Comments
 (0)