[InstCombine][X86] Only demand the active index bits for VPERMV/VPERMV3 mask values #106750
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

VPERMV/VPERMV3 only use the lower bits of the vector element indices, so use SimplifyDemandedBits to ignore anything touching the remaining bits.

Fixes #106413

Patch is 26.51 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/106750.diff

3 Files Affected:
diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index 9cc5ed5d89ad70..e96ce28616c898 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -2144,6 +2144,22 @@ static Value *simplifyX86vpermv3(const IntrinsicInst &II,
return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, Size));
}
+// Simplify VPERMV/VPERMV3 mask - only demand the active index bits.
+static bool simplifyX86VPERMMask(Instruction *II, bool IsBinary,
+ InstCombiner &IC) {
+ auto *VecTy = cast<FixedVectorType>(II->getType());
+ unsigned EltSizeInBits = VecTy->getScalarSizeInBits();
+ unsigned NumElts = VecTy->getNumElements();
+ assert(isPowerOf2_32(NumElts) && isPowerOf2_32(EltSizeInBits) &&
+ "Unexpected shuffle mask size");
+
+ unsigned IdxSizeInBits = Log2_32(IsBinary ? (2 * NumElts) : NumElts);
+ APInt DemandedMask = APInt::getLowBitsSet(EltSizeInBits, IdxSizeInBits);
+
+ KnownBits KnownMask(EltSizeInBits);
+ return IC.SimplifyDemandedBits(II, 1, DemandedMask, KnownMask);
+}
+
std::optional<Instruction *>
X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
@@ -3004,6 +3020,8 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
return IC.replaceInstUsesWith(II, V);
}
+ if (simplifyX86VPERMMask(&II, false, IC))
+ return &II;
break;
case Intrinsic::x86_avx512_vpermi2var_d_128:
@@ -3027,6 +3045,8 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
if (Value *V = simplifyX86vpermv3(II, IC.Builder)) {
return IC.replaceInstUsesWith(II, V);
}
+ if (simplifyX86VPERMMask(&II, true, IC))
+ return &II;
break;
case Intrinsic::x86_avx_maskload_ps:
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vperm.ll b/llvm/test/Transforms/InstCombine/X86/x86-vperm.ll
index 6519e4f5348484..9a52c62d01404e 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-vperm.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vperm.ll
@@ -91,8 +91,7 @@ define <8 x i32> @undef_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passt
define <8 x i32> @demandedbit_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @demandedbit_test_permvar_si_256_mask(
-; CHECK-NEXT: [[M:%.*]] = or <8 x i32> [[A1:%.*]], <i32 0, i32 8, i32 -8, i32 16, i32 -16, i32 32, i32 -32, i32 64>
-; CHECK-NEXT: [[S:%.*]] = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[A0:%.*]], <8 x i32> [[M]])
+; CHECK-NEXT: [[S:%.*]] = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]])
; CHECK-NEXT: ret <8 x i32> [[S]]
;
%m = or <8 x i32> %a1, <i32 0, i32 8, i32 -8, i32 16, i32 -16, i32 32, i32 -32, i32 64>
@@ -190,8 +189,7 @@ define <8 x float> @undef_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float>
define <8 x float> @demandedbit_test_permvar_sf_256_mask(<8 x float> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @demandedbit_test_permvar_sf_256_mask(
-; CHECK-NEXT: [[M:%.*]] = or <8 x i32> [[A1:%.*]], <i32 0, i32 8, i32 -8, i32 16, i32 -16, i32 32, i32 -32, i32 64>
-; CHECK-NEXT: [[S:%.*]] = call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[A0:%.*]], <8 x i32> [[M]])
+; CHECK-NEXT: [[S:%.*]] = call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[A0:%.*]], <8 x i32> [[A1:%.*]])
; CHECK-NEXT: ret <8 x float> [[S]]
;
%m = or <8 x i32> %a1, <i32 0, i32 8, i32 -8, i32 16, i32 -16, i32 32, i32 -32, i32 64>
@@ -297,8 +295,7 @@ define <4 x i64> @undef_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passt
define <4 x i64> @demandedbits_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: @demandedbits_test_permvar_di_256_mask(
-; CHECK-NEXT: [[M:%.*]] = or <4 x i64> [[A1:%.*]], <i64 0, i64 4, i64 -4, i64 8>
-; CHECK-NEXT: [[S:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[A0:%.*]], <4 x i64> [[M]])
+; CHECK-NEXT: [[S:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[A0:%.*]], <4 x i64> [[A1:%.*]])
; CHECK-NEXT: ret <4 x i64> [[S]]
;
%m = or <4 x i64> %a1, <i64 0, i64 4, i64 -4, i64 8>
@@ -404,8 +401,7 @@ define <4 x double> @undef_test_permvar_df_256_mask(<4 x double> %a0, <4 x doubl
define <4 x double> @demandedbits_test_permvar_df_256_mask(<4 x double> %a0, <4 x i64> %a1) {
; CHECK-LABEL: @demandedbits_test_permvar_df_256_mask(
-; CHECK-NEXT: [[M:%.*]] = or <4 x i64> [[A1:%.*]], <i64 0, i64 4, i64 -4, i64 8>
-; CHECK-NEXT: [[S:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[A0:%.*]], <4 x i64> [[M]])
+; CHECK-NEXT: [[S:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[A0:%.*]], <4 x i64> [[A1:%.*]])
; CHECK-NEXT: ret <4 x double> [[S]]
;
%m = or <4 x i64> %a1, <i64 0, i64 4, i64 -4, i64 8>
@@ -503,8 +499,7 @@ define <16 x i32> @undef_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %pa
define <16 x i32> @demandedbit_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @demandedbit_test_permvar_si_512_mask(
-; CHECK-NEXT: [[M:%.*]] = or <16 x i32> [[A1:%.*]], <i32 0, i32 16, i32 -16, i32 32, i32 -32, i32 64, i32 -64, i32 128, i32 -128, i32 256, i32 -256, i32 512, i32 -512, i32 1024, i32 -1024, i32 2048>
-; CHECK-NEXT: [[S:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[A0:%.*]], <16 x i32> [[M]])
+; CHECK-NEXT: [[S:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1:%.*]])
; CHECK-NEXT: ret <16 x i32> [[S]]
;
%m = or <16 x i32> %a1, <i32 0, i32 16, i32 -16, i32 32, i32 -32, i32 64, i32 -64, i32 128, i32 -128, i32 256, i32 -256, i32 512, i32 -512, i32 1024, i32 -1024, i32 2048>
@@ -602,8 +597,7 @@ define <16 x float> @undef_test_permvar_sf_512_mask(<16 x float> %a0, <16 x floa
define <16 x float> @demandedbit_test_permvar_sf_512_mask(<16 x float> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @demandedbit_test_permvar_sf_512_mask(
-; CHECK-NEXT: [[M:%.*]] = or <16 x i32> [[A1:%.*]], <i32 0, i32 16, i32 -16, i32 32, i32 -32, i32 64, i32 -64, i32 128, i32 -128, i32 256, i32 -256, i32 512, i32 -512, i32 1024, i32 -1024, i32 2048>
-; CHECK-NEXT: [[S:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[A0:%.*]], <16 x i32> [[M]])
+; CHECK-NEXT: [[S:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[A0:%.*]], <16 x i32> [[A1:%.*]])
; CHECK-NEXT: ret <16 x float> [[S]]
;
%m = or <16 x i32> %a1, <i32 0, i32 16, i32 -16, i32 32, i32 -32, i32 64, i32 -64, i32 128, i32 -128, i32 256, i32 -256, i32 512, i32 -512, i32 1024, i32 -1024, i32 2048>
@@ -701,8 +695,7 @@ define <8 x i64> @undef_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passt
define <8 x i64> @demandedbit_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: @demandedbit_test_permvar_di_512_mask(
-; CHECK-NEXT: [[M:%.*]] = or <8 x i64> [[A1:%.*]], <i64 0, i64 8, i64 -8, i64 16, i64 -16, i64 32, i64 -32, i64 64>
-; CHECK-NEXT: [[S:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[A0:%.*]], <8 x i64> [[M]])
+; CHECK-NEXT: [[S:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1:%.*]])
; CHECK-NEXT: ret <8 x i64> [[S]]
;
%m = or <8 x i64> %a1, <i64 0, i64 8, i64 -8, i64 16, i64 -16, i64 32, i64 -32, i64 64>
@@ -800,8 +793,7 @@ define <8 x double> @undef_test_permvar_df_512_mask(<8 x double> %a0, <8 x doubl
define <8 x double> @demandedbit_test_permvar_df_512_mask(<8 x double> %a0, <8 x i64> %a1) {
; CHECK-LABEL: @demandedbit_test_permvar_df_512_mask(
-; CHECK-NEXT: [[M:%.*]] = or <8 x i64> [[A1:%.*]], <i64 0, i64 8, i64 -8, i64 16, i64 -16, i64 32, i64 -32, i64 64>
-; CHECK-NEXT: [[S:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[A0:%.*]], <8 x i64> [[M]])
+; CHECK-NEXT: [[S:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[A0:%.*]], <8 x i64> [[A1:%.*]])
; CHECK-NEXT: ret <8 x double> [[S]]
;
%m = or <8 x i64> %a1, <i64 0, i64 8, i64 -8, i64 16, i64 -16, i64 32, i64 -32, i64 64>
@@ -899,8 +891,7 @@ define <8 x i16> @undef_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passt
define <8 x i16> @demandedbit_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: @demandedbit_test_permvar_hi_128_mask(
-; CHECK-NEXT: [[M:%.*]] = or <8 x i16> [[A1:%.*]], <i16 0, i16 8, i16 -8, i16 16, i16 -16, i16 32, i16 -32, i16 64>
-; CHECK-NEXT: [[S:%.*]] = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> [[A0:%.*]], <8 x i16> [[M]])
+; CHECK-NEXT: [[S:%.*]] = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]])
; CHECK-NEXT: ret <8 x i16> [[S]]
;
%m = or <8 x i16> %a1, <i16 0, i16 8, i16 -8, i16 16, i16 -16, i16 32, i16 -32, i16 64>
@@ -998,8 +989,7 @@ define <16 x i16> @undef_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %pa
define <16 x i16> @demandedbit_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: @demandedbit_test_permvar_hi_256_mask(
-; CHECK-NEXT: [[M:%.*]] = or <16 x i16> [[A1:%.*]], <i16 0, i16 16, i16 -16, i16 32, i16 -32, i16 64, i16 -64, i16 128, i16 -128, i16 256, i16 -256, i16 512, i16 -512, i16 1024, i16 -1024, i16 2048>
-; CHECK-NEXT: [[S:%.*]] = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> [[A0:%.*]], <16 x i16> [[M]])
+; CHECK-NEXT: [[S:%.*]] = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
; CHECK-NEXT: ret <16 x i16> [[S]]
;
%m = or <16 x i16> %a1, <i16 0, i16 16, i16 -16, i16 32, i16 -32, i16 64, i16 -64, i16 128, i16 -128, i16 256, i16 -256, i16 512, i16 -512, i16 1024, i16 -1024, i16 2048>
@@ -1097,8 +1087,7 @@ define <32 x i16> @undef_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %pa
define <32 x i16> @demandedbit_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %a1) {
; CHECK-LABEL: @demandedbit_test_permvar_hi_512_mask(
-; CHECK-NEXT: [[M:%.*]] = or <32 x i16> [[A1:%.*]], <i16 0, i16 32, i16 -32, i16 64, i16 -64, i16 128, i16 -128, i16 256, i16 -256, i16 512, i16 -512, i16 1024, i16 -1024, i16 2048, i16 -2048, i16 4096, i16 0, i16 32, i16 -32, i16 64, i16 -64, i16 128, i16 -128, i16 256, i16 -256, i16 512, i16 -512, i16 1024, i16 -1024, i16 2048, i16 -2048, i16 4096>
-; CHECK-NEXT: [[S:%.*]] = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> [[A0:%.*]], <32 x i16> [[M]])
+; CHECK-NEXT: [[S:%.*]] = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> [[A0:%.*]], <32 x i16> [[A1:%.*]])
; CHECK-NEXT: ret <32 x i16> [[S]]
;
%m = or <32 x i16> %a1, <i16 0, i16 32, i16 -32, i16 64, i16 -64, i16 128, i16 -128, i16 256, i16 -256, i16 512, i16 -512, i16 1024, i16 -1024, i16 2048, i16 -2048, i16 4096, i16 0, i16 32, i16 -32, i16 64, i16 -64, i16 128, i16 -128, i16 256, i16 -256, i16 512, i16 -512, i16 1024, i16 -1024, i16 2048, i16 -2048, i16 4096>
@@ -1196,8 +1185,7 @@ define <16 x i8> @undef_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passt
define <16 x i8> @demandedbit_test_permvar_qi_129_mask(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: @demandedbit_test_permvar_qi_129_mask(
-; CHECK-NEXT: [[M:%.*]] = or <16 x i8> [[A1:%.*]], <i8 0, i8 16, i8 -16, i8 32, i8 -32, i8 64, i8 -64, i8 -128, i8 -128, i8 0, i8 16, i8 -16, i8 32, i8 -32, i8 64, i8 -64>
-; CHECK-NEXT: [[S:%.*]] = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> [[A0:%.*]], <16 x i8> [[M]])
+; CHECK-NEXT: [[S:%.*]] = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> [[A0:%.*]], <16 x i8> [[A1:%.*]])
; CHECK-NEXT: ret <16 x i8> [[S]]
;
%m = or <16 x i8> %a1, <i8 0, i8 16, i8 -16, i8 32, i8 -32, i8 64, i8 -64, i8 128, i8 -128, i8 0, i8 16, i8 -16, i8 32, i8 -32, i8 64, i8 -64>
@@ -1295,8 +1283,7 @@ define <32 x i8> @undef_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passt
define <32 x i8> @demandedbit_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: @demandedbit_test_permvar_qi_256_mask(
-; CHECK-NEXT: [[M:%.*]] = or <32 x i8> [[A1:%.*]], <i8 0, i8 32, i8 -32, i8 64, i8 -64, i8 -128, i8 -128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 32, i8 -32, i8 64, i8 -64, i8 -128, i8 -128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
-; CHECK-NEXT: [[S:%.*]] = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> [[A0:%.*]], <32 x i8> [[M]])
+; CHECK-NEXT: [[S:%.*]] = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
; CHECK-NEXT: ret <32 x i8> [[S]]
;
%m = or <32 x i8> %a1, <i8 0, i8 32, i8 -32, i8 64, i8 -64, i8 128, i8 -128, i8 256, i8 -256, i8 512, i8 -512, i8 1024, i8 -1024, i8 2048, i8 -2048, i8 4096, i8 0, i8 32, i8 -32, i8 64, i8 -64, i8 128, i8 -128, i8 256, i8 -256, i8 512, i8 -512, i8 1024, i8 -1024, i8 2048, i8 -2048, i8 4096>
@@ -1394,8 +1381,7 @@ define <64 x i8> @undef_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passt
define <64 x i8> @demandedbit_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %a1) {
; CHECK-LABEL: @demandedbit_test_permvar_qi_512_mask(
-; CHECK-NEXT: [[M:%.*]] = or <64 x i8> [[A1:%.*]], <i8 0, i8 64, i8 -64, i8 -128, i8 -128, i8 0, i8 64, i8 -64, i8 -128, i8 -128, i8 0, i8 64, i8 -64, i8 -128, i8 -128, i8 0, i8 64, i8 -64, i8 -128, i8 -128, i8 0, i8 64, i8 -64, i8 -128, i8 -128, i8 0, i8 64, i8 -64, i8 -128, i8 -128, i8 0, i8 64, i8 -64, i8 -128, i8 -128, i8 0, i8 64, i8 -64, i8 -128, i8 -128, i8 0, i8 64, i8 -64, i8 -128, i8 -128, i8 0, i8 64, i8 -64, i8 -128, i8 -128, i8 0, i8 64, i8 -64, i8 -128, i8 -128, i8 0, i8 64, i8 -64, i8 -128, i8 -128, i8 0, i8 64, i8 -64, i8 -128>
-; CHECK-NEXT: [[S:%.*]] = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> [[A0:%.*]], <64 x i8> [[M]])
+; CHECK-NEXT: [[S:%.*]] = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> [[A0:%.*]], <64 x i8> [[A1:%.*]])
; CHECK-NEXT: ret <64 x i8> [[S]]
;
%m = or <64 x i8> %a1, <i8 0, i8 64, i8 -64, i8 128, i8 -128, i8 0, i8 64, i8 -64, i8 128, i8 -128, i8 0, i8 64, i8 -64, i8 128, i8 -128, i8 0, i8 64, i8 -64, i8 128, i8 -128, i8 0, i8 64, i8 -64, i8 128, i8 -128, i8 0, i8 64, i8 -64, i8 128, i8 -128, i8 0, i8 64, i8 -64, i8 128, i8 -128, i8 0, i8 64, i8 -64, i8 128, i8 -128, i8 0, i8 64, i8 -64, i8 128, i8 -128, i8 0, i8 64, i8 -64, i8 128, i8 -128, i8 0, i8 64, i8 -64, i8 128, i8 -128, i8 0, i8 64, i8 -64, i8 128, i8 -128, i8 0, i8 64, i8 -64, i8 128>
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vpermi2.ll b/llvm/test/Transforms/InstCombine/X86/x86-vpermi2.ll
index eb6ad4458d932e..d38e48ac817aee 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-vpermi2.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vpermi2.ll
@@ -28,8 +28,7 @@ define <2 x i64> @shuffle_vpermv3_v2i64_unary(<2 x i64> %x0) {
define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %m) {
; CHECK-LABEL: define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits(
; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[M:%.*]]) {
-; CHECK-NEXT: [[T:%.*]] = or <2 x i64> [[M]], <i64 0, i64 4>
-; CHECK-NEXT: [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[T]], <2 x i64> [[X1]])
+; CHECK-NEXT: [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[M]], <2 x i64> [[X1]])
; CHECK-NEXT: ret <2 x i64> [[R]]
;
%t = or <2 x i64> %m, <i64 0, i64 4>
@@ -72,8 +71,7 @@ define <4 x i64> @shuffle_vpermv3_v4i64_unary(<4 x i64> %x0) {
define <4 x i64> @shuffle_vpermv3_v4i64_demandedbits(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %m) {
; CHECK-LABEL: define <4 x i64> @shuffle_vpermv3_v4i64_demandedbits(
; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[M:%.*]]) {
-; CHECK-NEXT: [[T:%.*]] = or <4 x i64> [[M]], <i64 0, i64 8, i64 16, i64 32>
-; CHECK-NEXT: [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[T]], <4 x i64> [[X1]])
+; CHECK-NEXT: [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[M]], <4 x i64> [[X1]])
; CHECK-NEXT: ret <4 x i64> [[R]]
;
%t = or <4 x i64> %m, <i64 0, i64 8, i64 16, i64 32>
@@ -104,8 +102,7 @@ define <8 x i64> @shuffle_vpermv3_v8i64_unary(<8 x i64> %x0) {
define <8 x i64> @shuffle_vpermv3_v8i64_demandedbits(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %m) {
; CHECK-LABEL: define <8 x i64> @shuffle_vpermv3_v8i64_demandedbits(
; CHECK-SAME: <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[M:%.*]]) {
-; CHECK-NEXT: [[T:%.*]] = or <8 x i64> [[M]], <i64 0, i64 16, i64 32, i64 64, i64 256, i64 512, i64 1024, i64 -16>
-; CHECK-NEXT: [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> [[T]], <8 x i64> [[X1]])
+; CHECK-NEXT: [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> [[M]], <8 x i64> [[X1]])
; CHECK-NEXT: ret <8 x i64> [[R]]
;
%t = or <8 x i64> %m, <i64 0, i64 16, i64 32, i64 64, i64 256, i64 512, i64 1024, i64 -16>
@@ -140,8 +137,7 @@ define <4 x i32> @shuffle_vpermv3_v4i32_unary(<4 x i32> %x0) {
define <4 x i32> @shuffle_vpermv3_v4i32_demandedbits(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %m) {
; CHECK-LABEL: define <4 x i32> @shuffle_vpermv3_v4i32_demandedbits(
; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[M:%.*]]) {
-; CHECK-NEXT: [[T:%.*]] = or <4 x i32> [[M]], <i32 0, i32 8, i32 16, i32 32>
-; CHECK-NEXT: [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[T]], <4 x i32> [[X1]])
+; CHECK-NEXT: [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[M]], <4 x i32> [[X1]])
; CHECK-NEXT: ret <4 x i32> [[R]]
;
%t = or <4 x i32> %m, <i32 0, i32 8, i32 16, i32 32>
@@ -172,8 +168,7 @@ define <8 x i32> @shuffle_vpermv3_v8i32_unary(<8 x i32> %x0) {
define <8 x i32> @shuffle_vpermv3_v8i32_demandedbits(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %m) {
; CHECK-LABEL: define <8 x i32> @shuffle_vpermv3_v8i32_demandedbits(
; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[M:%.*]]) {
-; CHECK-NEXT: [[T:%.*]] = or <8 x i32> [[M]], <i32 0, i32 16, i32 32, i32 64, i32 256, i32 512, i32 -16, i32 -32>
-; CHECK-NEXT: [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[T]], <8 x i32> [[X1]])
+; CHECK-NEXT: [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[M]], <8 x i32> [[X1]])
; CHECK-NEXT: ret <8 x i32> [[R]]
;
%t = or <8 x i32> %m, <i32 0, i32 16, i32 32, i32 64, i32 256, i32 512, i32 -16, i32 -32>
@@ -204,8 +199,7 @@ define <16 x i32> @shuffle_vpermv3_v16i32_unary(<16 x i32> %x0) {
define <16 x i32> @shuffle_vpermv3_v16i32_demandedbits(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %m) {
; CHECK-LABEL: define <16 x i32> @shuffle_vpermv3_v16i32_demandedbits(
; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[M:%.*]]) {
-; CHECK-NEXT: [[T:%.*]] = or <16 x i32> [[M]], <i32 0, i32 32, i32 64, i32 256, i32 512, i32 1024, i32 2048, i32 4096, i32 8192, i32 -32, i32 -64, i32 -128, i32 -256, i32 -512, i32 -1024, i32 -2048>
-; CHECK-NEXT: [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> [[T]], <16 x i32> [[X1]])
+; CHECK-NEXT: [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> [[M]], <16 x i32> [[X1]])
; CHECK-NEXT: ret <16 x i32> [[R]]
;
%t = or <16 x i32> %m, <i32 0, i32 32, i32 64, i32 256, i32 512, i32 1024, i32 2048, i32 4096, i32 8192, i32 -32, i32 -64, i32 -128, i32 -256, i32 -512, i32 -1024, i32 -2048>
@@ -240,8 +234,7 @@ define <8 x i16> @shuffle_vpermv3_v8i16_unary(<8 x i16> %x0) {
define <8 x i16> @shuffle_vpermv3_v8i16_demandedbits(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %m) {
; CHECK-LABEL: define <8 x i16> @shuffle_vpermv3_v8i16_demandedbits(
; CHECK-SAME: <8 x i16> [[X0:%.*]], <8 x i16> [[X1:%.*]], <8 x i16> [[M:%.*]]) {
-; CHECK-NEXT: [[T:%.*]] = or <8 x ...
[truncated]
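To make the new helper's computation concrete: for the unary @llvm.x86.avx2.permd case on <8 x i32>, NumElts is 8, so IdxSizeInBits = Log2_32(8) = 3 and only bits [2:0] of each i32 index are demanded. A minimal LLVM IR sketch (the function name is hypothetical; the intrinsic and constants mirror the tests above) shows an 'or' that touches only undemanded bits:

; Each constant (8, 16, 32, 64) has no bits set in [2:0], so the 'or' is dead.
define <8 x i32> @vpermd_demandedbits_sketch(<8 x i32> %v, <8 x i32> %idx) {
  %m = or <8 x i32> %idx, <i32 8, i32 8, i32 16, i32 16, i32 32, i32 32, i32 64, i32 64>
  %r = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %v, <8 x i32> %m)
  ; With this patch, InstCombine folds the mask back to %idx:
  ;   %r = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %v, <8 x i32> %idx)
  ret <8 x i32> %r
}
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)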
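For the binary VPERMV3 case, the shuffle selects among 2 * NumElts elements, so one extra index bit is demanded: for @llvm.x86.avx512.vpermi2var.q.128 on <2 x i64>, IdxSizeInBits = Log2_32(4) = 2. A sketch under the same assumptions (hypothetical function name):

; 4 = 0b100 sets only bit 2, above the two demanded index bits.
define <2 x i64> @vpermi2var_demandedbits_sketch(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %m) {
  %t = or <2 x i64> %m, <i64 4, i64 4>
  %r = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %t, <2 x i64> %x1)
  ; With this patch, the 'or' folds away and %m feeds the intrinsic directly.
  ret <2 x i64> %r
}
declare <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>)

In both sketches the 'or' only sets bits above the demanded range, so SimplifyDemandedBits replaces the mask operand with the original indices, which is exactly what the updated CHECK lines verify.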
LGTM w/ a few style nits