
Commit 352df10

[X86][AVX] matchShuffleAsBlend - use isElementEquivalent to help match broadcast/repeated elements
Extend matchShuffleAsBlend to not only match against known in-place elements for BLEND shuffles, but also use isElementEquivalent to determine whether the shuffle mask's referenced element is the same as the in-place element. This allows us to replace a number of insertps instructions with more general blendps instructions (better opportunities for commutation, concatenation etc.).
1 parent 96fb3ee commit 352df10
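
Since the diff below only shows the code and test deltas, here is a minimal standalone C++ sketch of the idea. matchAsBlend, elementsEquivalent and the broadcast-only equivalence test are illustrative stand-ins, not the actual LLVM helpers (isElementEquivalent covers more cases than broadcasts):

#include <cstdint>
#include <optional>
#include <vector>

// Stand-in for isElementEquivalent: with a broadcast operand every element
// holds the same value, so any two indices within it are interchangeable.
static bool elementsEquivalent(bool OpIsBroadcast, int A, int B) {
  return A == B || OpIsBroadcast;
}

// Try to express Mask (indices 0..2*Size-1, -1 = undef) as a per-lane blend
// of V1 and V2; bit i of the result selects lane i from V2.
static std::optional<std::uint64_t> matchAsBlend(const std::vector<int> &Mask,
                                                 bool V1IsBroadcast,
                                                 bool V2IsBroadcast) {
  const int Size = static_cast<int>(Mask.size());
  std::uint64_t BlendMask = 0;
  for (int i = 0; i < Size; ++i) {
    const int M = Mask[i];
    if (M < 0)
      continue; // undef lane: either source works
    if (M < Size && elementsEquivalent(V1IsBroadcast, M, i))
      continue; // lane i effectively comes from V1 in place
    if (M >= Size && elementsEquivalent(V2IsBroadcast, M - Size, i)) {
      BlendMask |= std::uint64_t{1} << i; // lane i effectively from V2 in place
      continue;
    }
    return std::nullopt; // some lane is not an (equivalent) in-place element
  }
  return BlendMask;
}

// Example: mask {0,1,2,4} reads V2[0] into lane 3; if V2 is a broadcast this
// is equivalent to reading V2[3], so the shuffle is the blend 0b1000.
// matchAsBlend({0, 1, 2, 4}, /*V1IsBroadcast=*/false, /*V2IsBroadcast=*/true)
// returns 0b1000.

Under those simplified assumptions, a lane that reads a broadcast element no longer forces an insertps: the mask {0,1,2,4} becomes the blend mask 0b1000 (vblendps $8), which is exactly the vinsertps -> vblendps switch visible in the test diffs below.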

File tree: 4 files changed, +55 −51 lines

llvm/lib/Target/X86/X86ISelLowering.cpp

+7-2
@@ -12248,10 +12248,15 @@ static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
     int M = Mask[i];
     if (M == SM_SentinelUndef)
       continue;
-    if (M == i)
+    if (M == i ||
+        (0 <= M && M < Size && IsElementEquivalent(Size, V1, V1, M, i))) {
+      Mask[i] = i;
       continue;
-    if (M == i + Size) {
+    }
+    if (M == (i + Size) ||
+        (Size <= M && IsElementEquivalent(Size, V2, V2, M - Size, i))) {
       BlendMask |= 1ull << i;
+      Mask[i] = i + Size;
       continue;
     }
     if (Zeroable[i]) {

llvm/test/CodeGen/X86/avx.ll

+8-9
@@ -138,30 +138,29 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float
   ret <4 x float> %7
 }
 
-;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
 define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
 ; X86-LABEL: insertps_from_broadcast_multiple_use:
 ; X86: ## %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4
-; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
-; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; X86-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
-; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
+; X86-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
+; X86-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
 ; X86-NEXT: vaddps %xmm2, %xmm1, %xmm1
 ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: insertps_from_broadcast_multiple_use:
 ; X64: ## %bb.0:
 ; X64-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4
-; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
-; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; X64-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
-; X64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
+; X64-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
+; X64-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
 ; X64-NEXT: vaddps %xmm2, %xmm1, %xmm1
 ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
 ; X64-NEXT: retq

llvm/test/CodeGen/X86/horizontal-sum.ll

+6-6
@@ -39,7 +39,7 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1]
 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1]
 ; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1
-; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[1]
+; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
 ; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
 ; AVX1-SLOW-NEXT: retq
@@ -58,7 +58,7 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,3]
 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1]
 ; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1
-; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[1]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
 ; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
 ; AVX2-SLOW-NEXT: retq
@@ -227,7 +227,7 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
 ; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
-; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
+; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1
 ; AVX1-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
@@ -248,7 +248,7 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
 ; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
 ; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
-; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
+; AVX1-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; AVX1-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1
 ; AVX1-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
@@ -271,7 +271,7 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
 ; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
-; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1
 ; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
@@ -292,7 +292,7 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
 ; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
-; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
+; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; AVX2-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1
 ; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]

llvm/test/CodeGen/X86/sse41.ll

+34-34
@@ -1661,15 +1661,15 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-AVX1-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
-; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
-; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0]
-; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
-; X86-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0]
+; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
+; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3]
+; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
+; X86-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3]
 ; X86-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
-; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
-; X86-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0]
-; X86-AVX1-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
-; X86-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0]
+; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08]
+; X86-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[3]
+; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08]
+; X86-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[3]
 ; X86-AVX1-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
 ; X86-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
 ; X86-AVX1-NEXT: retl ## encoding: [0xc3]
@@ -1679,16 +1679,16 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-AVX512-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
-; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
-; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0]
-; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
-; X86-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0]
+; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
+; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3]
+; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
+; X86-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3]
+; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
+; X86-AVX512-NEXT: ## xmm2 = xmm2[0,1,2],xmm4[3]
+; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
+; X86-AVX512-NEXT: ## xmm3 = xmm3[0,1,2],xmm4[3]
 ; X86-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
-; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
-; X86-AVX512-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0]
-; X86-AVX512-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
-; X86-AVX512-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0]
-; X86-AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
+; X86-AVX512-NEXT: vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb]
 ; X86-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
 ; X86-AVX512-NEXT: retl ## encoding: [0xc3]
 ;
@@ -1712,32 +1712,32 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X64-AVX1-LABEL: insertps_from_broadcast_multiple_use:
 ; X64-AVX1: ## %bb.0:
 ; X64-AVX1-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
-; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
-; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0]
-; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
-; X64-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
+; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3]
+; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
+; X64-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3]
 ; X64-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
-; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
-; X64-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0]
-; X64-AVX1-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
-; X64-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0]
+; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08]
+; X64-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[3]
+; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08]
+; X64-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[3]
 ; X64-AVX1-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
 ; X64-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
 ; X64-AVX1-NEXT: retq ## encoding: [0xc3]
 ;
 ; X64-AVX512-LABEL: insertps_from_broadcast_multiple_use:
 ; X64-AVX512: ## %bb.0:
 ; X64-AVX512-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
-; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
-; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0]
-; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
-; X64-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
+; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3]
+; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
+; X64-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3]
+; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
+; X64-AVX512-NEXT: ## xmm2 = xmm2[0,1,2],xmm4[3]
+; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
+; X64-AVX512-NEXT: ## xmm3 = xmm3[0,1,2],xmm4[3]
 ; X64-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
-; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
-; X64-AVX512-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[0]
-; X64-AVX512-NEXT: vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
-; X64-AVX512-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[0]
-; X64-AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
+; X64-AVX512-NEXT: vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb]
 ; X64-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
 ; X64-AVX512-NEXT: retq ## encoding: [0xc3]
   %1 = getelementptr inbounds float, float* %fb, i64 %index
