Skip to content

Commit 7812fcf

Browse files
committed
[VectorCombine] foldBitcastShuf - add support for binary shuffles (REAPPLIED)
Generalise fold to "bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'". Reapplied with a clang codegen test fix. Further prep work for #67803
1 parent caf8b1f commit 7812fcf

File tree

3 files changed

+51
-14
lines changed

3 files changed

+51
-14
lines changed

clang/test/CodeGen/X86/avx-shuffle-builtins.c

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,8 @@ __m256 test_mm256_permute2f128_ps(__m256 a, __m256 b) {
6060

6161
__m256i test_mm256_permute2f128_si256(__m256i a, __m256i b) {
6262
// CHECK-LABEL: test_mm256_permute2f128_si256
63-
// CHECK: shufflevector{{.*}} <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
63+
// X64: shufflevector{{.*}}<i32 0, i32 1, i32 4, i32 5>
64+
// X86: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
6465
return _mm256_permute2f128_si256(a, b, 0x20);
6566
}
6667

@@ -104,7 +105,8 @@ __m256d test_mm256_insertf128_pd_0(__m256d a, __m128d b) {
104105

105106
__m256i test_mm256_insertf128_si256_0(__m256i a, __m128i b) {
106107
// CHECK-LABEL: test_mm256_insertf128_si256_0
107-
// CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
108+
// X64: shufflevector{{.*}}<i32 0, i32 1, i32 6, i32 7>
109+
// X86: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
108110
return _mm256_insertf128_si256(a, b, 0);
109111
}
110112

@@ -122,7 +124,8 @@ __m256d test_mm256_insertf128_pd_1(__m256d a, __m128d b) {
122124

123125
__m256i test_mm256_insertf128_si256_1(__m256i a, __m128i b) {
124126
// CHECK-LABEL: test_mm256_insertf128_si256_1
125-
// CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
127+
// X64: shufflevector{{.*}}<i32 0, i32 1, i32 4, i32 5>
128+
// X86: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
126129
return _mm256_insertf128_si256(a, b, 1);
127130
}
128131

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -684,10 +684,10 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
684684
/// destination type followed by shuffle. This can enable further transforms by
685685
/// moving bitcasts or shuffles together.
686686
bool VectorCombine::foldBitcastShuffle(Instruction &I) {
687-
Value *V0;
687+
Value *V0, *V1;
688688
ArrayRef<int> Mask;
689689
if (!match(&I, m_BitCast(m_OneUse(
690-
m_Shuffle(m_Value(V0), m_Undef(), m_Mask(Mask))))))
690+
m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
691691
return false;
692692

693693
// 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
@@ -728,28 +728,33 @@ bool VectorCombine::foldBitcastShuffle(Instruction &I) {
728728
FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
729729
auto *OldShuffleTy =
730730
FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
731+
bool IsUnary = isa<UndefValue>(V1);
732+
unsigned NumOps = IsUnary ? 1 : 2;
731733

732734
// The new shuffle must not cost more than the old shuffle.
733735
TargetTransformInfo::TargetCostKind CK =
734736
TargetTransformInfo::TCK_RecipThroughput;
735737
TargetTransformInfo::ShuffleKind SK =
736-
TargetTransformInfo::SK_PermuteSingleSrc;
738+
IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
739+
: TargetTransformInfo::SK_PermuteTwoSrc;
737740

738741
InstructionCost DestCost =
739742
TTI.getShuffleCost(SK, NewShuffleTy, NewMask, CK) +
740-
TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
741-
TargetTransformInfo::CastContextHint::None, CK);
743+
(NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
744+
TargetTransformInfo::CastContextHint::None,
745+
CK));
742746
InstructionCost SrcCost =
743747
TTI.getShuffleCost(SK, SrcTy, Mask, CK) +
744748
TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
745749
TargetTransformInfo::CastContextHint::None, CK);
746750
if (DestCost > SrcCost || !DestCost.isValid())
747751
return false;
748752

749-
// bitcast (shuf V0, MaskC) --> shuf (bitcast V0), MaskC'
753+
// bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
750754
++NumShufOfBitcast;
751-
Value *CastV = Builder.CreateBitCast(V0, NewShuffleTy);
752-
Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask);
755+
Value *CastV0 = Builder.CreateBitCast(V0, NewShuffleTy);
756+
Value *CastV1 = Builder.CreateBitCast(V1, NewShuffleTy);
757+
Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
753758
replaceValue(I, *Shuf);
754759
return true;
755760
}

llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2-
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s
3-
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s
4-
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s
2+
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK
3+
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK
4+
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512
55

66
define <4 x i64> @PR67803(<4 x i64> %x, <4 x i64> %y, <4 x i64> %a, <4 x i64> %b) {
77
; CHECK-LABEL: @PR67803(
@@ -35,6 +35,35 @@ define <4 x i64> @PR67803(<4 x i64> %x, <4 x i64> %y, <4 x i64> %a, <4 x i64> %b
3535
; CHECK-NEXT: [[SHUFFLE_I23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3636
; CHECK-NEXT: ret <4 x i64> [[SHUFFLE_I23]]
3737
;
38+
; AVX512-LABEL: @PR67803(
39+
; AVX512-NEXT: entry:
40+
; AVX512-NEXT: [[TMP0:%.*]] = bitcast <4 x i64> [[X:%.*]] to <8 x i32>
41+
; AVX512-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[Y:%.*]] to <8 x i32>
42+
; AVX512-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP0]], [[TMP1]]
43+
; AVX512-NEXT: [[CMP_I21:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
44+
; AVX512-NEXT: [[SEXT_I22:%.*]] = sext <4 x i1> [[CMP_I21]] to <4 x i32>
45+
; AVX512-NEXT: [[CMP_I:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
46+
; AVX512-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
47+
; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[SEXT_I22]], <4 x i32> [[SEXT_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
48+
; AVX512-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8>
49+
; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <32 x i8> [[TMP4]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
50+
; AVX512-NEXT: [[TMP6:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8>
51+
; AVX512-NEXT: [[TMP7:%.*]] = shufflevector <32 x i8> [[TMP6]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
52+
; AVX512-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
53+
; AVX512-NEXT: [[TMP9:%.*]] = shufflevector <32 x i8> [[TMP8]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
54+
; AVX512-NEXT: [[TMP10:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[TMP5]], <16 x i8> [[TMP7]], <16 x i8> [[TMP9]])
55+
; AVX512-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
56+
; AVX512-NEXT: [[TMP12:%.*]] = bitcast <4 x i64> [[A]] to <32 x i8>
57+
; AVX512-NEXT: [[TMP13:%.*]] = shufflevector <32 x i8> [[TMP12]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
58+
; AVX512-NEXT: [[TMP14:%.*]] = bitcast <4 x i64> [[B]] to <32 x i8>
59+
; AVX512-NEXT: [[TMP15:%.*]] = shufflevector <32 x i8> [[TMP14]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
60+
; AVX512-NEXT: [[TMP16:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
61+
; AVX512-NEXT: [[TMP17:%.*]] = shufflevector <32 x i8> [[TMP16]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
62+
; AVX512-NEXT: [[TMP18:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[TMP13]], <16 x i8> [[TMP15]], <16 x i8> [[TMP17]])
63+
; AVX512-NEXT: [[TMP19:%.*]] = bitcast <16 x i8> [[TMP18]] to <2 x i64>
64+
; AVX512-NEXT: [[SHUFFLE_I23:%.*]] = shufflevector <2 x i64> [[TMP11]], <2 x i64> [[TMP19]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
65+
; AVX512-NEXT: ret <4 x i64> [[SHUFFLE_I23]]
66+
;
3867
entry:
3968
%0 = bitcast <4 x i64> %x to <8 x i32>
4069
%extract = shufflevector <8 x i32> %0, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

0 commit comments

Comments
 (0)