Skip to content

Commit 2ac85d8

Browse files
committed
[VectorCombine] foldBitcastShuf - add support for binary shuffles
Generalise fold to "bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'". Further prep work for #67803
1 parent bdc77d1 commit 2ac85d8

File tree

2 files changed

+45
-11
lines changed

2 files changed

+45
-11
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -684,10 +684,10 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
684684
/// destination type followed by shuffle. This can enable further transforms by
685685
/// moving bitcasts or shuffles together.
686686
bool VectorCombine::foldBitcastShuffle(Instruction &I) {
687-
Value *V0;
687+
Value *V0, *V1;
688688
ArrayRef<int> Mask;
689689
if (!match(&I, m_BitCast(m_OneUse(
690-
m_Shuffle(m_Value(V0), m_Undef(), m_Mask(Mask))))))
690+
m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
691691
return false;
692692

693693
// 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
@@ -728,28 +728,33 @@ bool VectorCombine::foldBitcastShuffle(Instruction &I) {
728728
FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
729729
auto *OldShuffleTy =
730730
FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
731+
bool IsUnary = isa<UndefValue>(V1);
732+
unsigned NumOps = IsUnary ? 1 : 2;
731733

732734
// The new shuffle must not cost more than the old shuffle.
733735
TargetTransformInfo::TargetCostKind CK =
734736
TargetTransformInfo::TCK_RecipThroughput;
735737
TargetTransformInfo::ShuffleKind SK =
736-
TargetTransformInfo::SK_PermuteSingleSrc;
738+
IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
739+
: TargetTransformInfo::SK_PermuteTwoSrc;
737740

738741
InstructionCost DestCost =
739742
TTI.getShuffleCost(SK, NewShuffleTy, NewMask, CK) +
740-
TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
741-
TargetTransformInfo::CastContextHint::None, CK);
743+
(NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
744+
TargetTransformInfo::CastContextHint::None,
745+
CK));
742746
InstructionCost SrcCost =
743747
TTI.getShuffleCost(SK, SrcTy, Mask, CK) +
744748
TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
745749
TargetTransformInfo::CastContextHint::None, CK);
746750
if (DestCost > SrcCost || !DestCost.isValid())
747751
return false;
748752

749-
// bitcast (shuf V0, MaskC) --> shuf (bitcast V0), MaskC'
753+
// bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
750754
++NumShufOfBitcast;
751-
Value *CastV = Builder.CreateBitCast(V0, NewShuffleTy);
752-
Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask);
755+
Value *CastV0 = Builder.CreateBitCast(V0, NewShuffleTy);
756+
Value *CastV1 = Builder.CreateBitCast(V1, NewShuffleTy);
757+
Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
753758
replaceValue(I, *Shuf);
754759
return true;
755760
}

llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2-
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s
3-
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s
4-
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s
2+
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK
3+
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK
4+
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512
55

66
define <4 x i64> @PR67803(<4 x i64> %x, <4 x i64> %y, <4 x i64> %a, <4 x i64> %b) {
77
; CHECK-LABEL: @PR67803(
@@ -35,6 +35,35 @@ define <4 x i64> @PR67803(<4 x i64> %x, <4 x i64> %y, <4 x i64> %a, <4 x i64> %b
3535
; CHECK-NEXT: [[SHUFFLE_I23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3636
; CHECK-NEXT: ret <4 x i64> [[SHUFFLE_I23]]
3737
;
38+
; AVX512-LABEL: @PR67803(
39+
; AVX512-NEXT: entry:
40+
; AVX512-NEXT: [[TMP0:%.*]] = bitcast <4 x i64> [[X:%.*]] to <8 x i32>
41+
; AVX512-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[Y:%.*]] to <8 x i32>
42+
; AVX512-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP0]], [[TMP1]]
43+
; AVX512-NEXT: [[CMP_I21:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
44+
; AVX512-NEXT: [[SEXT_I22:%.*]] = sext <4 x i1> [[CMP_I21]] to <4 x i32>
45+
; AVX512-NEXT: [[CMP_I:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
46+
; AVX512-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
47+
; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[SEXT_I22]], <4 x i32> [[SEXT_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
48+
; AVX512-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8>
49+
; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <32 x i8> [[TMP4]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
50+
; AVX512-NEXT: [[TMP6:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8>
51+
; AVX512-NEXT: [[TMP7:%.*]] = shufflevector <32 x i8> [[TMP6]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
52+
; AVX512-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
53+
; AVX512-NEXT: [[TMP9:%.*]] = shufflevector <32 x i8> [[TMP8]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
54+
; AVX512-NEXT: [[TMP10:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[TMP5]], <16 x i8> [[TMP7]], <16 x i8> [[TMP9]])
55+
; AVX512-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
56+
; AVX512-NEXT: [[TMP12:%.*]] = bitcast <4 x i64> [[A]] to <32 x i8>
57+
; AVX512-NEXT: [[TMP13:%.*]] = shufflevector <32 x i8> [[TMP12]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
58+
; AVX512-NEXT: [[TMP14:%.*]] = bitcast <4 x i64> [[B]] to <32 x i8>
59+
; AVX512-NEXT: [[TMP15:%.*]] = shufflevector <32 x i8> [[TMP14]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
60+
; AVX512-NEXT: [[TMP16:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
61+
; AVX512-NEXT: [[TMP17:%.*]] = shufflevector <32 x i8> [[TMP16]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
62+
; AVX512-NEXT: [[TMP18:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[TMP13]], <16 x i8> [[TMP15]], <16 x i8> [[TMP17]])
63+
; AVX512-NEXT: [[TMP19:%.*]] = bitcast <16 x i8> [[TMP18]] to <2 x i64>
64+
; AVX512-NEXT: [[SHUFFLE_I23:%.*]] = shufflevector <2 x i64> [[TMP11]], <2 x i64> [[TMP19]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
65+
; AVX512-NEXT: ret <4 x i64> [[SHUFFLE_I23]]
66+
;
3867
entry:
3968
%0 = bitcast <4 x i64> %x to <8 x i32>
4069
%extract = shufflevector <8 x i32> %0, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

0 commit comments

Comments
 (0)