@@ -12096,6 +12096,38 @@ static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
   return DAG.getBitcast(VT, Res);
 }
 
+static SDValue lowerShuffleAsVSHLD(const SDLoc &DL, MVT VT, SDValue V1,
+                                   SDValue V2, ArrayRef<int> Mask,
+                                   const X86Subtarget &Subtarget,
+                                   SelectionDAG &DAG) {
+  if (!Subtarget.hasVBMI2())
+    return SDValue();
+  if (!Subtarget.hasVLX() && !VT.is512BitVector())
+    return SDValue();
+
+  unsigned ScalarSizeInBits = VT.getScalarSizeInBits();
+  for (int Scale = 2; Scale * ScalarSizeInBits <= 64; Scale *= 2) {
+    unsigned LaneSize = Scale * ScalarSizeInBits;
+    SmallVector<int, 8> RepeatedMask;
+    if (isRepeatedShuffleMask(LaneSize, VT, Mask, RepeatedMask)) {
+      for (int Shift = 1; Shift != Scale; ++Shift) {
+        if (isSequentialOrUndefInRange(RepeatedMask, 0, Shift,
+                                       (2 * Scale) - Shift) &&
+            isSequentialOrUndefInRange(RepeatedMask, Shift, Scale - Shift, 0)) {
+          MVT ShiftVT = MVT::getIntegerVT(LaneSize);
+          ShiftVT = MVT::getVectorVT(ShiftVT, VT.getSizeInBits() / LaneSize);
+          return DAG.getBitcast(
+              VT, DAG.getNode(X86ISD::VSHLD, DL, ShiftVT, V1, V2,
+                              DAG.getTargetConstant(Shift * ScalarSizeInBits,
+                                                    DL, MVT::i8)));
+        }
+      }
+    }
+  }
+
+  return SDValue();
+}
+
 /// Try to lower a vector shuffle as a bit shift (shifts in zeros).
 ///
 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
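Note (reviewer illustration, not part of the patch): the new helper matches shuffles whose per-lane repeated mask takes its first Shift elements from the top of the corresponding V2 lane and the remaining elements from the bottom of the V1 lane, which is the result produced by the VBMI2 concat-and-shift-left instructions (VPSHLDW/VPSHLDD/VPSHLDQ). Below is a minimal standalone sketch of that mask test; the simplified isSequentialOrUndef helper and the example v8i16 mask {9, 0, 11, 2, 13, 4, 15, 6} are illustrative stand-ins, not code taken from this commit.

// Standalone sketch of the VSHLD mask pattern matched above. Assumes the
// simplified isSequentialOrUndef below behaves like LLVM's
// isSequentialOrUndefInRange for the cases shown here.
#include <cstdio>
#include <vector>

// Mask[Pos..Pos+Size) must be Low, Low+1, ... or -1 (undef).
static bool isSequentialOrUndef(const std::vector<int> &Mask, int Pos,
                                int Size, int Low) {
  for (int i = 0; i < Size; ++i)
    if (Mask[Pos + i] != -1 && Mask[Pos + i] != Low + i)
      return false;
  return true;
}

int main() {
  // Per-32-bit-lane repeated mask for the v8i16 shuffle
  // {9, 0, 11, 2, 13, 4, 15, 6}: element 0 of each 32-bit lane comes from
  // V2's high i16 (repeated index 3), element 1 from V1's low i16 (index 0).
  const int Scale = 2; // two i16 elements per 32-bit lane
  std::vector<int> RepeatedMask = {3, 0};

  for (int Shift = 1; Shift != Scale; ++Shift)
    if (isSequentialOrUndef(RepeatedMask, 0, Shift, 2 * Scale - Shift) &&
        isSequentialOrUndef(RepeatedMask, Shift, Scale - Shift, 0))
      // This shape lowers to VPSHLDD with an immediate of Shift * 16 bits:
      // each resulting dword is (V1 << 16) | (V2 >> 16).
      std::printf("matched VSHLD, shift amount = %d bits\n", Shift * 16);
  return 0;
}

Running the sketch reports one match at a 16-bit shift, mirroring the Shift * ScalarSizeInBits immediate the lowering emits.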
@@ -13789,6 +13821,11 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
       return Rotate;
   }
 
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
   // Assume that a single SHUFPS is faster than an alternative sequence of
   // multiple instructions (even if the CPU has a domain penalty).
   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
@@ -14507,6 +14544,11 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                 Subtarget, DAG))
     return Rotate;
 
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
   if (SDValue BitBlend =
           lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
     return BitBlend;
@@ -14702,6 +14744,11 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                 Subtarget, DAG))
     return Rotate;
 
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
   // Use dedicated pack instructions for masks that match their pattern.
   if (SDValue V =
           lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
@@ -16861,6 +16908,11 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                   Subtarget, DAG))
       return Rotate;
 
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
   // Try to create an in-lane repeating shuffle mask and then shuffle the
   // results into the target lanes.
   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -16955,6 +17007,11 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                 Subtarget, DAG))
     return Rotate;
 
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
   // Try to create an in-lane repeating shuffle mask and then shuffle the
   // results into the target lanes.
   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -17078,6 +17135,11 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                 Subtarget, DAG))
     return Rotate;
 
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
   // Try to use bit rotation instructions.
   if (V2.isUndef())
     if (SDValue Rotate =
@@ -17590,6 +17652,11 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                   Subtarget, DAG))
       return Rotate;
 
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
   // Assume that a single SHUFPS is faster than using a permv shuffle.
   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
   if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
@@ -17655,6 +17722,11 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                 Subtarget, DAG))
     return Rotate;
 
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
   if (V2.isUndef()) {
     // Try to use bit rotation instructions.
     if (SDValue Rotate =
@@ -17726,6 +17798,11 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                 Subtarget, DAG))
     return Rotate;
 
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
   // Try to use bit rotation instructions.
   if (V2.isUndef())
     if (SDValue Rotate =