Skip to content

Commit 9aa7041

Browse files
committed
[X86] Canonicalize VPERMV3 to VPERMV if both sources are the same.
1 parent f9d6d46 commit 9aa7041

6 files changed

+51
-60
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

+21-10
Original file line numberDiff line numberDiff line change
@@ -42301,11 +42301,11 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
4230142301
return SDValue();
4230242302
}
4230342303
case X86ISD::VPERMV3: {
42304-
// Combine VPERMV3 to widened VPERMV if the two source operands are split
42305-
// from the same vector.
4230642304
SDValue V1 = peekThroughBitcasts(N.getOperand(0));
4230742305
SDValue V2 = peekThroughBitcasts(N.getOperand(2));
4230842306
MVT SVT = V1.getSimpleValueType();
42307+
// Combine VPERMV3 to widened VPERMV if the two source operands are split
42308+
// from the same vector.
4230942309
if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4231042310
V1.getConstantOperandVal(1) == 0 &&
4231142311
V2.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
@@ -42326,14 +42326,25 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
4232642326
}
4232742327
SmallVector<SDValue, 2> Ops;
4232842328
SmallVector<int, 32> Mask;
42329-
if (isShuffleFoldableLoad(N.getOperand(0)) &&
42330-
!isShuffleFoldableLoad(N.getOperand(2)) &&
42331-
getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42332-
ShuffleVectorSDNode::commuteMask(Mask);
42333-
SDValue NewMask = getConstVector(
42334-
Mask, N.getOperand(1).getSimpleValueType(), DAG, DL, /*IsMask=*/true);
42335-
return DAG.getNode(X86ISD::VPERMV3, DL, VT, N.getOperand(2), NewMask,
42336-
N.getOperand(0));
42329+
if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42330+
MVT MaskVT = N.getOperand(1).getSimpleValueType();
42331+
// Canonicalize to VPERMV if both sources are the same.
42332+
if (V1 == V2) {
42333+
for (int &M : Mask)
42334+
M = (M < 0 ? M : M & (Mask.size() - 1));
42335+
SDValue NewMask = getConstVector(Mask, MaskVT, DAG, DL,
42336+
/*IsMask=*/true);
42337+
return DAG.getNode(X86ISD::VPERMV, DL, VT, NewMask, N.getOperand(0));
42338+
}
42339+
// Commute foldable source to the RHS.
42340+
if (isShuffleFoldableLoad(N.getOperand(0)) &&
42341+
!isShuffleFoldableLoad(N.getOperand(2))) {
42342+
ShuffleVectorSDNode::commuteMask(Mask);
42343+
SDValue NewMask =
42344+
getConstVector(Mask, MaskVT, DAG, DL, /*IsMask=*/true);
42345+
return DAG.getNode(X86ISD::VPERMV3, DL, VT, N.getOperand(2), NewMask,
42346+
N.getOperand(0));
42347+
}
4233742348
}
4233842349
return SDValue();
4233942350
}

llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll

+10-10
Original file line numberDiff line numberDiff line change
@@ -1599,12 +1599,12 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
15991599
; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
16001600
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3
16011601
; AVX512BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
1602-
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,6,4,6,8,10,9,11]
1603-
; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm3, %zmm4
1602+
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,6,4,6,0,2,1,3]
1603+
; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm4, %zmm4
16041604
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u,50,58,u,u]
1605-
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,0,2,12,14,13,15]
1606-
; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm3, %zmm5
1607-
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm5[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29,37,45,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u,50,58]
1605+
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,0,2,4,6,5,7]
1606+
; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm5, %zmm3
1607+
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29,37,45,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u,50,58]
16081608
; AVX512BW-FCP-NEXT: movl $1227105426, %ecx # imm = 0x49242492
16091609
; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
16101610
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm3 {%k1}
@@ -1685,12 +1685,12 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
16851685
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
16861686
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3
16871687
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
1688-
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,6,4,6,8,10,9,11]
1689-
; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm3, %zmm4
1688+
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,6,4,6,0,2,1,3]
1689+
; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm4, %zmm4
16901690
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u,50,58,u,u]
1691-
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,0,2,12,14,13,15]
1692-
; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm3, %zmm5
1693-
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm5[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29,37,45,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u,50,58]
1691+
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,0,2,4,6,5,7]
1692+
; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm5, %zmm3
1693+
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29,37,45,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u,50,58]
16941694
; AVX512DQ-BW-FCP-NEXT: movl $1227105426, %ecx # imm = 0x49242492
16951695
; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
16961696
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm3 {%k1}

llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll

+2-4
Original file line numberDiff line numberDiff line change
@@ -24,17 +24,15 @@ define <32 x i16> @combine_vpermt2var_32i16_identity_mask(<32 x i16> %x0, <32 x
2424
; X86-NEXT: vpmovsxbw {{.*#+}} zmm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
2525
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
2626
; X86-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z}
27-
; X86-NEXT: vpmovsxbw {{.*#+}} zmm1 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
28-
; X86-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z}
27+
; X86-NEXT: vpermw %zmm0, %zmm1, %zmm0 {%k1} {z}
2928
; X86-NEXT: retl
3029
;
3130
; X64-LABEL: combine_vpermt2var_32i16_identity_mask:
3231
; X64: # %bb.0:
3332
; X64-NEXT: vpmovsxbw {{.*#+}} zmm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
3433
; X64-NEXT: kmovd %edi, %k1
3534
; X64-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z}
36-
; X64-NEXT: vpmovsxbw {{.*#+}} zmm1 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
37-
; X64-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z}
35+
; X64-NEXT: vpermw %zmm0, %zmm1, %zmm0 {%k1} {z}
3836
; X64-NEXT: retq
3937
%res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %x0, <32 x i16> %x1, i32 %m)
4038
%res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 63, i16 30, i16 61, i16 28, i16 59, i16 26, i16 57, i16 24, i16 55, i16 22, i16 53, i16 20, i16 51, i16 18, i16 49, i16 16, i16 47, i16 46, i16 13, i16 44, i16 11, i16 42, i16 9, i16 40, i16 7, i16 38, i16 5, i16 36, i16 3, i16 34, i16 1, i16 32>, <32 x i16> %res0, <32 x i16> %res0, i32 %m)

llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll

+2-4
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,15 @@ define <16 x i16> @combine_vpermt2var_16i16_identity_mask(<16 x i16> %x0, <16 x
1919
; X86-NEXT: vpmovsxbw {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
2020
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2121
; X86-NEXT: vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z}
22-
; X86-NEXT: vpmovsxbw {{.*#+}} ymm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
23-
; X86-NEXT: vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z}
22+
; X86-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k1} {z}
2423
; X86-NEXT: retl
2524
;
2625
; X64-LABEL: combine_vpermt2var_16i16_identity_mask:
2726
; X64: # %bb.0:
2827
; X64-NEXT: vpmovsxbw {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
2928
; X64-NEXT: kmovd %edi, %k1
3029
; X64-NEXT: vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z}
31-
; X64-NEXT: vpmovsxbw {{.*#+}} ymm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
32-
; X64-NEXT: vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z}
30+
; X64-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k1} {z}
3331
; X64-NEXT: retq
3432
%res0 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <16 x i16> %x0, <16 x i16> %x1, i16 %m)
3533
%res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> <i16 15, i16 30, i16 13, i16 28, i16 11, i16 26, i16 9, i16 24, i16 7, i16 22, i16 5, i16 20, i16 3, i16 18, i16 1, i16 16>, <16 x i16> %res0, <16 x i16> %res0, i16 %m)

llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll

+14-28
Original file line numberDiff line numberDiff line change
@@ -155,8 +155,7 @@ define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8
155155
; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax
156156
; X86-AVX512F-NEXT: kmovw %eax, %k1
157157
; X86-AVX512F-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
158-
; X86-AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
159-
; X86-AVX512F-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
158+
; X86-AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
160159
; X86-AVX512F-NEXT: retl
161160
;
162161
; X86-AVX512BW-LABEL: combine_vpermt2var_8f64_identity_mask:
@@ -165,26 +164,23 @@ define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8
165164
; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax
166165
; X86-AVX512BW-NEXT: kmovd %eax, %k1
167166
; X86-AVX512BW-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
168-
; X86-AVX512BW-NEXT: vmovapd {{.*#+}} zmm1 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
169-
; X86-AVX512BW-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
167+
; X86-AVX512BW-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
170168
; X86-AVX512BW-NEXT: retl
171169
;
172170
; X64-AVX512F-LABEL: combine_vpermt2var_8f64_identity_mask:
173171
; X64-AVX512F: # %bb.0:
174172
; X64-AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
175173
; X64-AVX512F-NEXT: kmovw %edi, %k1
176174
; X64-AVX512F-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
177-
; X64-AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
178-
; X64-AVX512F-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
175+
; X64-AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
179176
; X64-AVX512F-NEXT: retq
180177
;
181178
; X64-AVX512BW-LABEL: combine_vpermt2var_8f64_identity_mask:
182179
; X64-AVX512BW: # %bb.0:
183180
; X64-AVX512BW-NEXT: vmovapd {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
184181
; X64-AVX512BW-NEXT: kmovd %edi, %k1
185182
; X64-AVX512BW-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
186-
; X64-AVX512BW-NEXT: vmovapd {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
187-
; X64-AVX512BW-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
183+
; X64-AVX512BW-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
188184
; X64-AVX512BW-NEXT: retq
189185
%res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 %m)
190186
%res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 %m)
@@ -259,8 +255,7 @@ define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64>
259255
; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax
260256
; X86-AVX512F-NEXT: kmovw %eax, %k1
261257
; X86-AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
262-
; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
263-
; X86-AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
258+
; X86-AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
264259
; X86-AVX512F-NEXT: retl
265260
;
266261
; X86-AVX512BW-LABEL: combine_vpermt2var_8i64_identity_mask:
@@ -269,26 +264,23 @@ define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64>
269264
; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax
270265
; X86-AVX512BW-NEXT: kmovd %eax, %k1
271266
; X86-AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
272-
; X86-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
273-
; X86-AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
267+
; X86-AVX512BW-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
274268
; X86-AVX512BW-NEXT: retl
275269
;
276270
; X64-AVX512F-LABEL: combine_vpermt2var_8i64_identity_mask:
277271
; X64-AVX512F: # %bb.0:
278272
; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
279273
; X64-AVX512F-NEXT: kmovw %edi, %k1
280274
; X64-AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
281-
; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
282-
; X64-AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
275+
; X64-AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
283276
; X64-AVX512F-NEXT: retq
284277
;
285278
; X64-AVX512BW-LABEL: combine_vpermt2var_8i64_identity_mask:
286279
; X64-AVX512BW: # %bb.0:
287280
; X64-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
288281
; X64-AVX512BW-NEXT: kmovd %edi, %k1
289282
; X64-AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
290-
; X64-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
291-
; X64-AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
283+
; X64-AVX512BW-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
292284
; X64-AVX512BW-NEXT: retq
293285
%res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 %m)
294286
%res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 %m)
@@ -309,26 +301,23 @@ define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <1
309301
; X86-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
310302
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
311303
; X86-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
312-
; X86-NEXT: vmovaps {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
313-
; X86-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
304+
; X86-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
314305
; X86-NEXT: retl
315306
;
316307
; X64-AVX512F-LABEL: combine_vpermt2var_16f32_identity_mask:
317308
; X64-AVX512F: # %bb.0:
318309
; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
319310
; X64-AVX512F-NEXT: kmovw %edi, %k1
320311
; X64-AVX512F-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
321-
; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
322-
; X64-AVX512F-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
312+
; X64-AVX512F-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
323313
; X64-AVX512F-NEXT: retq
324314
;
325315
; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_identity_mask:
326316
; X64-AVX512BW: # %bb.0:
327317
; X64-AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
328318
; X64-AVX512BW-NEXT: kmovd %edi, %k1
329319
; X64-AVX512BW-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
330-
; X64-AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
331-
; X64-AVX512BW-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
320+
; X64-AVX512BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
332321
; X64-AVX512BW-NEXT: retq
333322
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x0, <16 x float> %x1, i16 %m)
334323
%res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, <16 x float> %res0, i16 %m)
@@ -598,26 +587,23 @@ define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x
598587
; X86-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
599588
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
600589
; X86-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
601-
; X86-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
602-
; X86-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
590+
; X86-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
603591
; X86-NEXT: retl
604592
;
605593
; X64-AVX512F-LABEL: combine_vpermt2var_16i32_identity_mask:
606594
; X64-AVX512F: # %bb.0:
607595
; X64-AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
608596
; X64-AVX512F-NEXT: kmovw %edi, %k1
609597
; X64-AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
610-
; X64-AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
611-
; X64-AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
598+
; X64-AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
612599
; X64-AVX512F-NEXT: retq
613600
;
614601
; X64-AVX512BW-LABEL: combine_vpermt2var_16i32_identity_mask:
615602
; X64-AVX512BW: # %bb.0:
616603
; X64-AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
617604
; X64-AVX512BW-NEXT: kmovd %edi, %k1
618605
; X64-AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
619-
; X64-AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
620-
; X64-AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
606+
; X64-AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
621607
; X64-AVX512BW-NEXT: retq
622608
%res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x i32> %x0, <16 x i32> %x1, i16 %m)
623609
%res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 %m)

llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll

+2-4
Original file line numberDiff line numberDiff line change
@@ -32,17 +32,15 @@ define <16 x i8> @combine_vpermt2var_16i8_identity_mask(<16 x i8> %x0, <16 x i8>
3232
; X86-NEXT: vmovdqa {{.*#+}} xmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
3333
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3434
; X86-NEXT: vpermt2b %xmm0, %xmm1, %xmm0 {%k1} {z}
35-
; X86-NEXT: vmovdqa {{.*#+}} xmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
36-
; X86-NEXT: vpermt2b %xmm0, %xmm1, %xmm0 {%k1} {z}
35+
; X86-NEXT: vpermb %xmm0, %xmm1, %xmm0 {%k1} {z}
3736
; X86-NEXT: retl
3837
;
3938
; X64-LABEL: combine_vpermt2var_16i8_identity_mask:
4039
; X64: # %bb.0:
4140
; X64-NEXT: vmovdqa {{.*#+}} xmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
4241
; X64-NEXT: kmovd %edi, %k1
4342
; X64-NEXT: vpermt2b %xmm0, %xmm1, %xmm0 {%k1} {z}
44-
; X64-NEXT: vmovdqa {{.*#+}} xmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
45-
; X64-NEXT: vpermt2b %xmm0, %xmm1, %xmm0 {%k1} {z}
43+
; X64-NEXT: vpermb %xmm0, %xmm1, %xmm0 {%k1} {z}
4644
; X64-NEXT: retq
4745
%res0 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> %x0, <16 x i8> %x1, i16 %m)
4846
%res1 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> <i8 15, i8 30, i8 13, i8 28, i8 11, i8 26, i8 9, i8 24, i8 7, i8 22, i8 5, i8 20, i8 3, i8 18, i8 1, i8 16>, <16 x i8> %res0, <16 x i8> %res0, i16 %m)

0 commit comments

Comments
 (0)