@@ -8194,8 +8194,8 @@ static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
81948194 Input = Op->getOperand(1);
81958195 Elt -= 4;
81968196 }
8197- SDValue BitCast = DAG.getBitcast(MVT::v4i32 , Input);
8198- Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32 , BitCast,
8197+ SDValue BitCast = DAG.getBitcast(MVT::v4f32 , Input);
8198+ Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32 , BitCast,
81998199 DAG.getConstant(Elt, dl, MVT::i32));
82008200 }
82018201 }
@@ -8214,19 +8214,70 @@ static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
82148214 Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
82158215 SDValue NewShuffle = DAG.getVectorShuffle(
82168216 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
8217- SDValue BitCast = DAG.getBitcast(MVT::v4i32 , NewShuffle);
8217+ SDValue BitCast = DAG.getBitcast(MVT::v4f32 , NewShuffle);
82188218
82198219 for (int Part = 0; Part < 4; ++Part)
82208220 if (!Parts[Part])
8221- Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32 ,
8221+ Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32 ,
82228222 BitCast, DAG.getConstant(Part, dl, MVT::i32));
82238223 }
82248224 // Build a vector out of the various parts and bitcast it back to the original
82258225 // type.
8226- SDValue NewVec = DAG.getBuildVector(MVT::v4i32 , dl, Parts);
8226+ SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR , dl, MVT::v4f32 , Parts);
82278227 return DAG.getBitcast(VT, NewVec);
82288228}
82298229
8230+ static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op,
8231+ ArrayRef<int> ShuffleMask,
8232+ SelectionDAG &DAG) {
8233+ SDValue V1 = Op.getOperand(0);
8234+ SDValue V2 = Op.getOperand(1);
8235+ EVT VT = Op.getValueType();
8236+ unsigned NumElts = VT.getVectorNumElements();
8237+
8238+ // An One-Off Identity mask is one that is mostly an identity mask from as
8239+ // single source but contains a single element out-of-place, either from a
8240+ // different vector or from another position in the same vector. As opposed to
8241+ // lowering this via a ARMISD::BUILD_VECTOR we can generate an extract/insert
8242+ // pair directly.
8243+ auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
8244+ int &OffElement) {
8245+ OffElement = -1;
8246+ int NonUndef = 0;
8247+ for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
8248+ if (Mask[i] == -1)
8249+ continue;
8250+ NonUndef++;
8251+ if (Mask[i] != i + BaseOffset) {
8252+ if (OffElement == -1)
8253+ OffElement = i;
8254+ else
8255+ return false;
8256+ }
8257+ }
8258+ return NonUndef > 2 && OffElement != -1;
8259+ };
8260+ int OffElement;
8261+ SDValue VInput;
8262+ if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
8263+ VInput = V1;
8264+ else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
8265+ VInput = V2;
8266+ else
8267+ return SDValue();
8268+
8269+ SDLoc dl(Op);
8270+ EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
8271+ ? MVT::i32
8272+ : VT.getScalarType();
8273+ SDValue Elt = DAG.getNode(
8274+ ISD::EXTRACT_VECTOR_ELT, dl, SVT,
8275+ ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
8276+ DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
8277+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
8278+ DAG.getVectorIdxConstant(OffElement % NumElts, dl));
8279+ }
8280+
82308281static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
82318282 const ARMSubtarget *ST) {
82328283 SDValue V1 = Op.getOperand(0);
@@ -8360,6 +8411,10 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
83608411 }
83618412 }
83628413
8414+ if (ST->hasMVEIntegerOps() && EltSize <= 32)
8415+ if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
8416+ return V;
8417+
83638418 // If the shuffle is not directly supported and it has 4 elements, use
83648419 // the PerfectShuffle-generated table to synthesize it from other shuffles.
83658420 unsigned NumElts = VT.getVectorNumElements();
0 commit comments