
Commit 39a0ded

[X86] lowerShuffleAsDecomposedShuffleMerge - don't lower to unpack+permute if either source is zero.
Fixes #104482
1 parent e54f683 commit 39a0ded
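
The patch threads the existing Zeroable element mask into lowerShuffleAsDecomposedShuffleMerge and skips the unpack+permute lowering whenever every element drawn from one of the two sources is known to be zero, presumably because the zero-aware fallback lowerings produce much shorter code in that case (as the updated test below shows). A minimal standalone sketch of the new guard, in plain C++ rather than the LLVM API; allowUnpackPermute and its parameter names are illustrative, not from the patch:

// Mask follows the shufflevector convention: entries in [0, n) select from
// V1, entries in [n, 2n) select from V2, and -1 marks an undef element.
// zeroable[i] means result element i is known to be zero.
#include <cassert>
#include <vector>

bool allowUnpackPermute(const std::vector<int> &mask,
                        const std::vector<bool> &zeroable) {
  const int n = static_cast<int>(mask.size());
  bool v1Zero = true, v2Zero = true; // mirrors V1Zero/V2Zero in the patch
  for (int i = 0; i < n; ++i) {
    if (mask[i] >= 0 && mask[i] < n)
      v1Zero = v1Zero && zeroable[i]; // element i is taken from V1
    else if (mask[i] >= n)
      v2Zero = v2Zero && zeroable[i]; // element i is taken from V2
  }
  // Reject unpack+permute if every element from either source is zero.
  return !v1Zero && !v2Zero;
}

int main() {
  // Interleave V1 with an all-zero V2 (n = 4): the elements taken from V2
  // are all zeroable, so the unpack+permute lowering is rejected.
  assert(!allowUnpackPermute({0, 4, 1, 5}, {false, true, false, true}));
}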

File tree

2 files changed (+36, -42):

llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll


llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 29 additions & 25 deletions
@@ -11156,14 +11156,15 @@ static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
 /// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
 static SDValue lowerShuffleAsDecomposedShuffleMerge(
     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
-    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+    const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
   int NumElts = Mask.size();
   int NumLanes = VT.getSizeInBits() / 128;
   int NumEltsPerLane = NumElts / NumLanes;

   // Shuffle the input elements into the desired positions in V1 and V2 and
   // unpack/blend them together.
   bool IsAlternating = true;
+  bool V1Zero = true, V2Zero = true;
   SmallVector<int, 32> V1Mask(NumElts, -1);
   SmallVector<int, 32> V2Mask(NumElts, -1);
   SmallVector<int, 32> FinalMask(NumElts, -1);
@@ -11172,10 +11173,12 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
     if (M >= 0 && M < NumElts) {
       V1Mask[i] = M;
       FinalMask[i] = i;
+      V1Zero &= Zeroable[i];
       IsAlternating &= (i & 1) == 0;
     } else if (M >= NumElts) {
       V2Mask[i] = M - NumElts;
       FinalMask[i] = i + NumElts;
+      V2Zero &= Zeroable[i];
       IsAlternating &= (i & 1) == 1;
     }
   }
@@ -11228,7 +11231,7 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
   // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
   // it is better to process t4 first to create a vector of t4[0], then unpack
   // that vector with t2.
-  if (!isSingleElementRepeatedMask(V1Mask) &&
+  if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
       !isSingleElementRepeatedMask(V2Mask))
     if (SDValue UnpackPerm =
             lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
@@ -12955,7 +12958,7 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // a permute. That will be faster than the domain cross.
   if (IsBlendSupported)
     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
-                                                Subtarget, DAG);
+                                                Zeroable, Subtarget, DAG);

   // We implement this with SHUFPD which is pretty lame because it will likely
   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
@@ -13274,7 +13277,7 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // a permute. That will be faster than the domain cross.
   if (IsBlendSupported)
     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
-                                                Subtarget, DAG);
+                                                Zeroable, Subtarget, DAG);

   // Try to lower by permuting the inputs into an unpack instruction.
   if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
@@ -14065,8 +14068,8 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

   // We can always bit-blend if we have to so the fallback strategy is to
   // decompose into single-input permutes and blends/unpacks.
-  return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
-                                              Mask, Subtarget, DAG);
+  return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
+                                              Zeroable, Subtarget, DAG);
 }

 /// Lower 8-lane 16-bit floating point shuffles.
@@ -14444,7 +14447,7 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // Handle multi-input cases by blending/unpacking single-input shuffles.
   if (NumV2Elements > 0)
     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
-                                                Subtarget, DAG);
+                                                Zeroable, Subtarget, DAG);

   // The fallback path for single-input shuffles widens this into two v8i16
   // vectors with unpacks, shuffles those, and then pulls them back together
@@ -14668,6 +14671,7 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
 /// results.
 static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                           SDValue V2, ArrayRef<int> Mask,
+                                          const APInt &Zeroable,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
   assert(!V2.isUndef() && "This routine must not be used to lower single-input "
@@ -14694,8 +14698,8 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
     return true;
   };
   if (DoBothBroadcast())
-    return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
-                                                DAG);
+    return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
+                                                Subtarget, DAG);

   // If the inputs all stem from a single 128-bit lane of each input, then we
   // split them rather than blending because the split will decompose to
@@ -14714,8 +14718,8 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,

   // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
   // requires that the decomposed single-input shuffles don't end up here.
-  return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
-                                              DAG);
+  return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
+                                              Subtarget, DAG);
 }

 // Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
@@ -15907,7 +15911,7 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // blend the result.
   if (V1IsInPlace || V2IsInPlace)
     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
-                                                Subtarget, DAG);
+                                                Zeroable, Subtarget, DAG);

   // Try to create an in-lane repeating shuffle mask and then shuffle the
   // results into the target lanes.
@@ -15934,10 +15938,10 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // can fully permute the elements.
   if (Subtarget.hasAVX2())
     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
-                                                Subtarget, DAG);
+                                                Zeroable, Subtarget, DAG);

   // Otherwise fall back on generic lowering.
-  return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
+  return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
                                     Subtarget, DAG);
 }

@@ -16027,7 +16031,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // blend the result.
   if (V1IsInPlace || V2IsInPlace)
     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
-                                                Subtarget, DAG);
+                                                Zeroable, Subtarget, DAG);

   // Try to create an in-lane repeating shuffle mask and then shuffle the
   // results into the target lanes.
@@ -16051,7 +16055,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

   // Otherwise fall back on generic blend lowering.
   return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
-                                              Subtarget, DAG);
+                                              Zeroable, Subtarget, DAG);
 }

 /// Handle lowering of 8-lane 32-bit floating point shuffles.
@@ -16162,17 +16166,17 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // since after split we get a more efficient code using vpunpcklwd and
   // vpunpckhwd instrs than vblend.
   if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
-    return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
-                                      DAG);
+    return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
+                                      Subtarget, DAG);

   // If we have AVX2 then we always want to lower with a blend because at v8 we
   // can fully permute the elements.
   if (Subtarget.hasAVX2())
     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
-                                                Subtarget, DAG);
+                                                Zeroable, Subtarget, DAG);

   // Otherwise fall back on generic lowering.
-  return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
+  return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
                                     Subtarget, DAG);
 }

@@ -16210,8 +16214,8 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // vpunpcklwd and vpunpckhwd instrs.
   if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
       !Subtarget.hasAVX512())
-    return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
-                                      DAG);
+    return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
+                                      Subtarget, DAG);

   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
                                           Zeroable, Subtarget, DAG))
@@ -16315,7 +16319,7 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

   // Otherwise fall back on generic blend lowering.
   return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
-                                              Subtarget, DAG);
+                                              Zeroable, Subtarget, DAG);
 }

 /// Handle lowering of 16-lane 16-bit integer shuffles.
@@ -16437,7 +16441,7 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return V;

   // Otherwise fall back on generic lowering.
-  return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
+  return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
                                     Subtarget, DAG);
 }

@@ -16558,7 +16562,7 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return V;

   // Otherwise fall back on generic lowering.
-  return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
+  return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
                                     Subtarget, DAG);
 }

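Note that every call site above already has a Zeroable APInt in scope (the same mask passed to lowerShuffleAsBlend in the v8i32 hunk), so these hunks are pure parameter plumbing rather than new analysis. As a rough standalone model of what such a mask encodes, assuming a simplified world where we only track which input lanes are constant zero (the real lowering derives it from known-undef and known-zero analysis of the shuffle operands):

#include <vector>

// Result element i is "zeroable" when its mask entry is undef (-1) or picks
// a lane of an input that is known to be zero.
std::vector<bool> computeZeroable(const std::vector<int> &mask,
                                  const std::vector<bool> &v1LaneZero,
                                  const std::vector<bool> &v2LaneZero) {
  const int n = static_cast<int>(mask.size());
  std::vector<bool> zeroable(n, false);
  for (int i = 0; i < n; ++i) {
    if (mask[i] < 0)
      zeroable[i] = true;                    // undef can be treated as zero
    else if (mask[i] < n)
      zeroable[i] = v1LaneZero[mask[i]];     // selects a V1 lane
    else
      zeroable[i] = v2LaneZero[mask[i] - n]; // selects a V2 lane
  }
  return zeroable;
}

With V2 = zeroinitializer, every mask entry >= n yields a zeroable element, which is exactly the situation the PR104482 testcase below exercises.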
llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll

Lines changed: 7 additions & 17 deletions
@@ -2775,23 +2775,13 @@ entry:
 define <8 x i16> @PR104482(<16 x i8> %i) {
 ; SSE2-LABEL: PR104482:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,0,1,2]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,65535,0]
-; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    packuswb %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: PR104482: