
Commit 0fb198e

[X86] Remove combineShuffleOfConcatUndef fold (#144524)
We can now let a mixture of combineConcatVectorOps and target shuffle combining handle this instead of creating ISD::CONCAT_VECTORS nodes and hoping they will merge properly. In the horizontal-sum.ll test changes we were creating an ISD::CONCAT_VECTORS node that was split shortly afterwards, but not before it caused issues with HADD folding due to the additional uses.
1 parent 8f79754 commit 0fb198e
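
For context, the pattern the removed fold targeted (as described by the comments in the deleted code below) can be illustrated with the following LLVM IR sketch. The function and value names are purely illustrative and are not taken from the affected tests.

; Both sources are half-width vectors widened with undef, which become
; (concat_vectors x, undef) nodes in the SelectionDAG.
define <8 x i32> @shuffle_of_concat_undef(<4 x i32> %a, <4 x i32> %b) {
  %c0 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %c1 = shufflevector <4 x i32> %b, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ; The outer shuffle only reads the defined low halves, so the removed fold
  ; rewrote it as a single-source shuffle of (concat_vectors %a, %b), which
  ; AVX2 can lower with VPERMD/VPERMQ. After this commit, combineConcatVectorOps
  ; and target shuffle combining are expected to reach the same result.
  %r = shufflevector <8 x i32> %c0, <8 x i32> %c1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  ret <8 x i32> %r
}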

File tree: 2 files changed (+14 lines, -68 lines)

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 0 additions & 54 deletions
@@ -43301,51 +43301,6 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,
   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
 }
 
-// We are looking for a shuffle where both sources are concatenated with undef
-// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
-// if we can express this as a single-source shuffle, that's preferable.
-static SDValue combineShuffleOfConcatUndef(SDNode *N, const SDLoc &DL,
-                                           SelectionDAG &DAG,
-                                           const X86Subtarget &Subtarget) {
-  if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
-    return SDValue();
-
-  EVT VT = N->getValueType(0);
-
-  // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
-  if (!VT.is128BitVector() && !VT.is256BitVector())
-    return SDValue();
-
-  if (VT.getVectorElementType() != MVT::i32 &&
-      VT.getVectorElementType() != MVT::i64 &&
-      VT.getVectorElementType() != MVT::f32 &&
-      VT.getVectorElementType() != MVT::f64)
-    return SDValue();
-
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-
-  // Check that both sources are concats with undef.
-  if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
-      N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
-      N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
-      !N1.getOperand(1).isUndef())
-    return SDValue();
-
-  // Construct the new shuffle mask. Elements from the first source retain their
-  // index, but elements from the second source no longer need to skip an undef.
-  SmallVector<int, 8> Mask;
-  int NumElts = VT.getVectorNumElements();
-
-  auto *SVOp = cast<ShuffleVectorSDNode>(N);
-  for (int Elt : SVOp->getMask())
-    Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
-
-  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
-                               N1.getOperand(0));
-  return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
-}
-
 /// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
 /// low half of each source vector and does not set any high half elements in
 /// the destination vector, narrow the shuffle to half its original size.
@@ -43401,15 +43356,6 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
           VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
     return LD;
 
-  // For AVX2, we sometimes want to combine
-  //   (vector_shuffle <mask> (concat_vectors t1, undef)
-  //                          (concat_vectors t2, undef))
-  // Into:
-  //   (vector_shuffle <mask> (concat_vectors t1, t2), undef)
-  // Since the latter can be efficiently lowered with VPERMD/VPERMQ
-  if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, dl, DAG, Subtarget))
-    return ShufConcat;
-
   if (isTargetShuffle(N->getOpcode())) {
     SDValue Op(N, 0);
     if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))

llvm/test/CodeGen/X86/horizontal-sum.ll

Lines changed: 14 additions & 14 deletions
@@ -247,13 +247,13 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vhaddps %xmm4, %xmm4, %xmm1
-; AVX2-SLOW-NEXT:    vhaddps %xmm5, %xmm5, %xmm8
+; AVX2-SLOW-NEXT:    vhaddps %xmm5, %xmm5, %xmm4
 ; AVX2-SLOW-NEXT:    vhaddps %xmm3, %xmm2, %xmm2
-; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
-; AVX2-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
-; AVX2-SLOW-NEXT:    vhaddps %xmm4, %xmm5, %xmm3
-; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
-; AVX2-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
+; AVX2-SLOW-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
+; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
+; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; AVX2-SLOW-NEXT:    vaddps %xmm1, %xmm3, %xmm1
 ; AVX2-SLOW-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
 ; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -268,13 +268,13 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX2-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vhaddps %xmm4, %xmm4, %xmm1
-; AVX2-FAST-NEXT:    vhaddps %xmm5, %xmm5, %xmm8
+; AVX2-FAST-NEXT:    vhaddps %xmm5, %xmm5, %xmm4
 ; AVX2-FAST-NEXT:    vhaddps %xmm3, %xmm2, %xmm2
-; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
-; AVX2-FAST-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
-; AVX2-FAST-NEXT:    vhaddps %xmm4, %xmm5, %xmm3
-; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
-; AVX2-FAST-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
+; AVX2-FAST-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
+; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; AVX2-FAST-NEXT:    vaddps %xmm1, %xmm3, %xmm1
 ; AVX2-FAST-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
 ; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -424,7 +424,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
 ; AVX2-SLOW-NEXT:    vpbroadcastd %xmm4, %xmm5
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
-; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
+; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; AVX2-SLOW-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
 ; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -447,7 +447,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
 ; AVX2-FAST-NEXT:    vpbroadcastd %xmm4, %xmm5
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
-; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
+; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
 ; AVX2-FAST-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; AVX2-FAST-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
 ; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
