@@ -11156,14 +11156,15 @@ static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
11156
11156
/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
11157
11157
static SDValue lowerShuffleAsDecomposedShuffleMerge(
11158
11158
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11159
- const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11159
+ const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11160
11160
int NumElts = Mask.size();
11161
11161
int NumLanes = VT.getSizeInBits() / 128;
11162
11162
int NumEltsPerLane = NumElts / NumLanes;
11163
11163
11164
11164
// Shuffle the input elements into the desired positions in V1 and V2 and
11165
11165
// unpack/blend them together.
11166
11166
bool IsAlternating = true;
11167
+ bool V1Zero = true, V2Zero = true;
11167
11168
SmallVector<int, 32> V1Mask(NumElts, -1);
11168
11169
SmallVector<int, 32> V2Mask(NumElts, -1);
11169
11170
SmallVector<int, 32> FinalMask(NumElts, -1);
@@ -11172,10 +11173,12 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
11172
11173
if (M >= 0 && M < NumElts) {
11173
11174
V1Mask[i] = M;
11174
11175
FinalMask[i] = i;
11176
+ V1Zero &= Zeroable[i];
11175
11177
IsAlternating &= (i & 1) == 0;
11176
11178
} else if (M >= NumElts) {
11177
11179
V2Mask[i] = M - NumElts;
11178
11180
FinalMask[i] = i + NumElts;
11181
+ V2Zero &= Zeroable[i];
11179
11182
IsAlternating &= (i & 1) == 1;
11180
11183
}
11181
11184
}
@@ -11228,7 +11231,7 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
11228
11231
// t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11229
11232
// it is better to process t4 first to create a vector of t4[0], then unpack
11230
11233
// that vector with t2.
11231
- if (!isSingleElementRepeatedMask(V1Mask) &&
11234
+ if (!V1Zero && !V2Zero && ! isSingleElementRepeatedMask(V1Mask) &&
11232
11235
!isSingleElementRepeatedMask(V2Mask))
11233
11236
if (SDValue UnpackPerm =
11234
11237
lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
@@ -12955,7 +12958,7 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
12955
12958
// a permute. That will be faster than the domain cross.
12956
12959
if (IsBlendSupported)
12957
12960
return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
12958
- Subtarget, DAG);
12961
+ Zeroable, Subtarget, DAG);
12959
12962
12960
12963
// We implement this with SHUFPD which is pretty lame because it will likely
12961
12964
// incur 2 cycles of stall for integer vectors on Nehalem and older chips.
@@ -13274,7 +13277,7 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13274
13277
// a permute. That will be faster than the domain cross.
13275
13278
if (IsBlendSupported)
13276
13279
return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13277
- Subtarget, DAG);
13280
+ Zeroable, Subtarget, DAG);
13278
13281
13279
13282
// Try to lower by permuting the inputs into an unpack instruction.
13280
13283
if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
@@ -14065,8 +14068,8 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14065
14068
14066
14069
// We can always bit-blend if we have to so the fallback strategy is to
14067
14070
// decompose into single-input permutes and blends/unpacks.
14068
- return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
14069
- Mask , Subtarget, DAG);
14071
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
14072
+ Zeroable , Subtarget, DAG);
14070
14073
}
14071
14074
14072
14075
/// Lower 8-lane 16-bit floating point shuffles.
@@ -14444,7 +14447,7 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14444
14447
// Handle multi-input cases by blending/unpacking single-input shuffles.
14445
14448
if (NumV2Elements > 0)
14446
14449
return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
14447
- Subtarget, DAG);
14450
+ Zeroable, Subtarget, DAG);
14448
14451
14449
14452
// The fallback path for single-input shuffles widens this into two v8i16
14450
14453
// vectors with unpacks, shuffles those, and then pulls them back together
@@ -14668,6 +14671,7 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
14668
14671
/// results.
14669
14672
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
14670
14673
SDValue V2, ArrayRef<int> Mask,
14674
+ const APInt &Zeroable,
14671
14675
const X86Subtarget &Subtarget,
14672
14676
SelectionDAG &DAG) {
14673
14677
assert(!V2.isUndef() && "This routine must not be used to lower single-input "
@@ -14694,8 +14698,8 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
14694
14698
return true;
14695
14699
};
14696
14700
if (DoBothBroadcast())
14697
- return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget ,
14698
- DAG);
14701
+ return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable ,
14702
+ Subtarget, DAG);
14699
14703
14700
14704
// If the inputs all stem from a single 128-bit lane of each input, then we
14701
14705
// split them rather than blending because the split will decompose to
@@ -14714,8 +14718,8 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
14714
14718
14715
14719
// Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
14716
14720
// requires that the decomposed single-input shuffles don't end up here.
14717
- return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget ,
14718
- DAG);
14721
+ return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable ,
14722
+ Subtarget, DAG);
14719
14723
}
14720
14724
14721
14725
// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
@@ -15907,7 +15911,7 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15907
15911
// blend the result.
15908
15912
if (V1IsInPlace || V2IsInPlace)
15909
15913
return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
15910
- Subtarget, DAG);
15914
+ Zeroable, Subtarget, DAG);
15911
15915
15912
15916
// Try to create an in-lane repeating shuffle mask and then shuffle the
15913
15917
// results into the target lanes.
@@ -15934,10 +15938,10 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15934
15938
// can fully permute the elements.
15935
15939
if (Subtarget.hasAVX2())
15936
15940
return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
15937
- Subtarget, DAG);
15941
+ Zeroable, Subtarget, DAG);
15938
15942
15939
15943
// Otherwise fall back on generic lowering.
15940
- return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
15944
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
15941
15945
Subtarget, DAG);
15942
15946
}
15943
15947
@@ -16027,7 +16031,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16027
16031
// blend the result.
16028
16032
if (V1IsInPlace || V2IsInPlace)
16029
16033
return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16030
- Subtarget, DAG);
16034
+ Zeroable, Subtarget, DAG);
16031
16035
16032
16036
// Try to create an in-lane repeating shuffle mask and then shuffle the
16033
16037
// results into the target lanes.
@@ -16051,7 +16055,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16051
16055
16052
16056
// Otherwise fall back on generic blend lowering.
16053
16057
return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16054
- Subtarget, DAG);
16058
+ Zeroable, Subtarget, DAG);
16055
16059
}
16056
16060
16057
16061
/// Handle lowering of 8-lane 32-bit floating point shuffles.
@@ -16162,17 +16166,17 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16162
16166
// since after split we get a more efficient code using vpunpcklwd and
16163
16167
// vpunpckhwd instrs than vblend.
16164
16168
if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16165
- return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget ,
16166
- DAG);
16169
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable ,
16170
+ Subtarget, DAG);
16167
16171
16168
16172
// If we have AVX2 then we always want to lower with a blend because at v8 we
16169
16173
// can fully permute the elements.
16170
16174
if (Subtarget.hasAVX2())
16171
16175
return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16172
- Subtarget, DAG);
16176
+ Zeroable, Subtarget, DAG);
16173
16177
16174
16178
// Otherwise fall back on generic lowering.
16175
- return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
16179
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16176
16180
Subtarget, DAG);
16177
16181
}
16178
16182
@@ -16210,8 +16214,8 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16210
16214
// vpunpcklwd and vpunpckhwd instrs.
16211
16215
if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16212
16216
!Subtarget.hasAVX512())
16213
- return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget ,
16214
- DAG);
16217
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable ,
16218
+ Subtarget, DAG);
16215
16219
16216
16220
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16217
16221
Zeroable, Subtarget, DAG))
@@ -16315,7 +16319,7 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16315
16319
16316
16320
// Otherwise fall back on generic blend lowering.
16317
16321
return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16318
- Subtarget, DAG);
16322
+ Zeroable, Subtarget, DAG);
16319
16323
}
16320
16324
16321
16325
/// Handle lowering of 16-lane 16-bit integer shuffles.
@@ -16437,7 +16441,7 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16437
16441
return V;
16438
16442
16439
16443
// Otherwise fall back on generic lowering.
16440
- return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
16444
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16441
16445
Subtarget, DAG);
16442
16446
}
16443
16447
@@ -16558,7 +16562,7 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16558
16562
return V;
16559
16563
16560
16564
// Otherwise fall back on generic lowering.
16561
- return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
16565
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
16562
16566
Subtarget, DAG);
16563
16567
}
16564
16568
0 commit comments