[X86] Remove combineShuffleOfConcatUndef fold #144524

Merged: 1 commit, Jun 17, 2025
54 changes: 0 additions & 54 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43301,51 +43301,6 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,
  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}

// We are looking for a shuffle where both sources are concatenated with undef
// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
// if we can express this as a single-source shuffle, that's preferable.
static SDValue combineShuffleOfConcatUndef(SDNode *N, const SDLoc &DL,
                                           SelectionDAG &DAG,
                                           const X86Subtarget &Subtarget) {
  if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
    return SDValue();

  EVT VT = N->getValueType(0);

  // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
  if (!VT.is128BitVector() && !VT.is256BitVector())
    return SDValue();

  if (VT.getVectorElementType() != MVT::i32 &&
      VT.getVectorElementType() != MVT::i64 &&
      VT.getVectorElementType() != MVT::f32 &&
      VT.getVectorElementType() != MVT::f64)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Check that both sources are concats with undef.
  if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
      N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
      N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
      !N1.getOperand(1).isUndef())
    return SDValue();

  // Construct the new shuffle mask. Elements from the first source retain their
  // index, but elements from the second source no longer need to skip an undef.
  SmallVector<int, 8> Mask;
  int NumElts = VT.getVectorNumElements();

  auto *SVOp = cast<ShuffleVectorSDNode>(N);
  for (int Elt : SVOp->getMask())
    Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));

  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
                               N1.getOperand(0));
  return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
}

/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
/// low half of each source vector and does not set any high half elements in
/// the destination vector, narrow the shuffle to half its original size.
@@ -43401,15 +43356,6 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
          VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
    return LD;

  // For AVX2, we sometimes want to combine
  // (vector_shuffle <mask> (concat_vectors t1, undef)
  //                        (concat_vectors t2, undef))
  // Into:
  // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
  // Since the latter can be efficiently lowered with VPERMD/VPERMQ
  if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, dl, DAG, Subtarget))
    return ShufConcat;

  if (isTargetShuffle(N->getOpcode())) {
    SDValue Op(N, 0);
    if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
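For reference, the fold deleted above rewrote the shuffle mask so that indices selecting from the second (concat_vectors t2, undef) operand point into the upper half of a single (concat_vectors t1, t2) operand instead. The standalone C++ sketch below reproduces only that index remapping on an illustrative 8-element interleaving mask; the mask values and the program itself are illustrative and not part of the patch.

// Standalone sketch of the index remapping done by the removed
// combineShuffleOfConcatUndef. Assumes two 4-element sources each widened to
// 8 elements by concatenation with undef; the example mask is hypothetical.
#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 8; // number of elements in the shuffle result
  // Indices >= NumElts select from the second operand, i.e. from
  // (concat_vectors t2, undef); only its defined low half is ever referenced.
  const std::vector<int> OldMask = {0, 8, 1, 9, 2, 10, 3, 11};

  std::vector<int> NewMask;
  for (int Elt : OldMask)
    // Second-source lanes no longer skip an undef half: they land in the
    // upper half of the single (concat_vectors t1, t2) operand.
    NewMask.push_back(Elt < NumElts ? Elt : Elt - NumElts / 2);

  // Prints 0 4 1 5 2 6 3 7: a single-source shuffle AVX2 can lower as VPERMD.
  for (int M : NewMask)
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}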
28 changes: 14 additions & 14 deletions llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -247,13 +247,13 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1
; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm8
; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4
; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm5, %xmm3
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -268,13 +268,13 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX2-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1
; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm8
; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm4
; AVX2-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm5, %xmm3
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
; AVX2-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1
; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -424,7 +424,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
; AVX2-SLOW-NEXT: vpbroadcastd %xmm4, %xmm5
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -447,7 +447,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm5
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]