diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index cd02d275d6b57..12fcc614ab254 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43301,51 +43301,6 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,
   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
 }
 
-// We are looking for a shuffle where both sources are concatenated with undef
-// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
-// if we can express this as a single-source shuffle, that's preferable.
-static SDValue combineShuffleOfConcatUndef(SDNode *N, const SDLoc &DL,
-                                           SelectionDAG &DAG,
-                                           const X86Subtarget &Subtarget) {
-  if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
-    return SDValue();
-
-  EVT VT = N->getValueType(0);
-
-  // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
-  if (!VT.is128BitVector() && !VT.is256BitVector())
-    return SDValue();
-
-  if (VT.getVectorElementType() != MVT::i32 &&
-      VT.getVectorElementType() != MVT::i64 &&
-      VT.getVectorElementType() != MVT::f32 &&
-      VT.getVectorElementType() != MVT::f64)
-    return SDValue();
-
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-
-  // Check that both sources are concats with undef.
-  if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
-      N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
-      N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
-      !N1.getOperand(1).isUndef())
-    return SDValue();
-
-  // Construct the new shuffle mask. Elements from the first source retain their
-  // index, but elements from the second source no longer need to skip an undef.
-  SmallVector<int, 8> Mask;
-  int NumElts = VT.getVectorNumElements();
-
-  auto *SVOp = cast<ShuffleVectorSDNode>(N);
-  for (int Elt : SVOp->getMask())
-    Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
-
-  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
-                               N1.getOperand(0));
-  return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
-}
-
 /// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
 /// low half of each source vector and does not set any high half elements in
 /// the destination vector, narrow the shuffle to half its original size.
@@ -43401,15 +43356,6 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
           VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
     return LD;
 
-  // For AVX2, we sometimes want to combine
-  // (vector_shuffle <mask> (concat_vectors t1, undef)
-  //                        (concat_vectors t2, undef))
-  // Into:
-  // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
-  // Since the latter can be efficiently lowered with VPERMD/VPERMQ
-  if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, dl, DAG, Subtarget))
-    return ShufConcat;
-
   if (isTargetShuffle(N->getOpcode())) {
     SDValue Op(N, 0);
     if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 0afc4f784bc5e..568150cfa3971 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -247,13 +247,13 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1
-; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm8
+; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4
 ; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
-; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
-; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm5, %xmm3
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
-; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
+; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1
 ; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -268,13 +268,13 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX2-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
 ; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
 ; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1
-; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm8
+; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm4
 ; AVX2-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
-; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
-; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm5, %xmm3
-; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
-; AVX2-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
+; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
+; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; AVX2-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1
 ; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -424,7 +424,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm4, %xmm5
 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -447,7 +447,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
 ; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm5
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
-; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
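For readers skimming the diff: the removed combine rewrote a two-source shuffle of `(concat_vectors t1, undef)` and `(concat_vectors t2, undef)` into a single-source shuffle of `(concat_vectors t1, t2)` so that AVX2 could lower it with VPERMD/VPERMQ. The sketch below is a minimal, stand-alone illustration of the mask remapping that rewrite performed, outside of SelectionDAG; the `remapMask` helper, the sample 4-element mask, and the `main` driver are hypothetical and exist only for this example, not part of the patch or of LLVM.

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

// Hypothetical stand-alone illustration of the mask rewrite the removed
// combineShuffleOfConcatUndef performed: a two-source shuffle of
// (concat t1, undef) and (concat t2, undef) becomes a one-source shuffle of
// (concat t1, t2). Indices into the first source keep their value; indices
// into the second source drop by NumElts/2, because t2 now occupies the
// upper half of the single concatenated source instead of a separate vector.
static std::vector<int> remapMask(const std::vector<int> &OldMask, int NumElts) {
  std::vector<int> NewMask;
  NewMask.reserve(OldMask.size());
  for (int Elt : OldMask)
    NewMask.push_back(Elt < NumElts ? Elt : Elt - NumElts / 2);
  return NewMask;
}

int main() {
  // v4i32 example: t1 is lanes {0,1} of source 0, t2 is lanes {0,1} of
  // source 1 (old indices 4 and 5). The mask interleaves t1 and t2.
  std::vector<int> OldMask = {0, 4, 1, 5};
  std::vector<int> NewMask = remapMask(OldMask, /*NumElts=*/4);
  // After the rewrite the single source is (concat t1, t2), so t2's lanes
  // live at indices 2 and 3.
  assert((NewMask == std::vector<int>{0, 2, 1, 3}));
  for (int Elt : NewMask)
    std::printf("%d ", Elt);
  std::printf("\n");
  return 0;
}
```

With the combine gone, the generic shuffle-combining paths produce the code asserted by the updated horizontal-sum.ll checks above: in the AVX2 float paths, the old vhaddps/vshufps pair is replaced by a vshufps/vblendps pair, and the xmm8 temporary is no longer needed.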