diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index a62fb7f723cdb..77139f38c977b 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -2144,6 +2144,22 @@ static Value *simplifyX86vpermv3(const IntrinsicInst &II,
   return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, Size));
 }
 
+// Simplify VPERMV/VPERMV3 mask - only demand the active index bits.
+static bool simplifyX86VPERMMask(Instruction *II, bool IsBinary,
+                                 InstCombiner &IC) {
+  auto *VecTy = cast<FixedVectorType>(II->getType());
+  unsigned EltSizeInBits = VecTy->getScalarSizeInBits();
+  unsigned NumElts = VecTy->getNumElements();
+  assert(isPowerOf2_32(NumElts) && isPowerOf2_32(EltSizeInBits) &&
+         "Unexpected shuffle mask size");
+
+  unsigned IdxSizeInBits = Log2_32(IsBinary ? (2 * NumElts) : NumElts);
+  APInt DemandedMask = APInt::getLowBitsSet(EltSizeInBits, IdxSizeInBits);
+
+  KnownBits KnownMask(EltSizeInBits);
+  return IC.SimplifyDemandedBits(II, /*OpNo=*/1, DemandedMask, KnownMask);
+}
+
 std::optional<Instruction *>
 X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
@@ -3004,6 +3020,8 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
       return IC.replaceInstUsesWith(II, V);
     }
+    if (simplifyX86VPERMMask(&II, /*IsBinary=*/false, IC))
+      return &II;
     break;
 
   case Intrinsic::x86_avx512_vpermi2var_d_128:
@@ -3027,6 +3045,8 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     if (Value *V = simplifyX86vpermv3(II, IC.Builder)) {
       return IC.replaceInstUsesWith(II, V);
    }
+    if (simplifyX86VPERMMask(&II, /*IsBinary=*/true, IC))
+      return &II;
    break;
 
   case Intrinsic::x86_avx_maskload_ps:
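To illustrate the unary fold this enables, here is a minimal hypothetical IR sketch in the style of the regression tests below (the function name and constants are illustrative, not taken from the tests). @llvm.x86.avx2.permd selects among 8 lanes, so only the low log2(8) = 3 bits of each i32 index are demanded:

; The `or` only sets bits 3-31 (-8 = 0xFFFFFFF8), none of which are demanded,
; so SimplifyDemandedBits strips it and the intrinsic reads %a1 directly.
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)

define <8 x i32> @unary_mask_sketch(<8 x i32> %a0, <8 x i32> %a1) {
  %m = or <8 x i32> %a1, <i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8>
  %s = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %m)
  ret <8 x i32> %s
}
; Expected after opt -passes=instcombine:
;   %s = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %a1)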
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vperm.ll b/llvm/test/Transforms/InstCombine/X86/x86-vperm.ll
index 6519e4f534848..9a52c62d01404 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-vperm.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vperm.ll
@@ -91,8 +91,7 @@ define <8 x i32> @undef_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passt
 
 define <8 x i32> @demandedbit_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @demandedbit_test_permvar_si_256_mask(
-; CHECK-NEXT:    [[M:%.*]] = or <8 x i32> [[A1:%.*]],
-; CHECK-NEXT:    [[S:%.*]] = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[A0:%.*]], <8 x i32> [[M]])
+; CHECK-NEXT:    [[S:%.*]] = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]])
 ; CHECK-NEXT:    ret <8 x i32> [[S]]
 ;
   %m = or <8 x i32> %a1,
@@ -190,8 +189,7 @@ define <8 x float> @undef_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float>
 
 define <8 x float> @demandedbit_test_permvar_sf_256_mask(<8 x float> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @demandedbit_test_permvar_sf_256_mask(
-; CHECK-NEXT:    [[M:%.*]] = or <8 x i32> [[A1:%.*]],
-; CHECK-NEXT:    [[S:%.*]] = call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[A0:%.*]], <8 x i32> [[M]])
+; CHECK-NEXT:    [[S:%.*]] = call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[A0:%.*]], <8 x i32> [[A1:%.*]])
 ; CHECK-NEXT:    ret <8 x float> [[S]]
 ;
   %m = or <8 x i32> %a1,
@@ -297,8 +295,7 @@ define <4 x i64> @undef_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passt
 
 define <4 x i64> @demandedbits_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: @demandedbits_test_permvar_di_256_mask(
-; CHECK-NEXT:    [[M:%.*]] = or <4 x i64> [[A1:%.*]],
-; CHECK-NEXT:    [[S:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[A0:%.*]], <4 x i64> [[M]])
+; CHECK-NEXT:    [[S:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[A0:%.*]], <4 x i64> [[A1:%.*]])
 ; CHECK-NEXT:    ret <4 x i64> [[S]]
 ;
   %m = or <4 x i64> %a1,
@@ -404,8 +401,7 @@ define <4 x double> @undef_test_permvar_df_256_mask(<4 x double> %a0, <4 x doubl
 
 define <4 x double> @demandedbits_test_permvar_df_256_mask(<4 x double> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: @demandedbits_test_permvar_df_256_mask(
-; CHECK-NEXT:    [[M:%.*]] = or <4 x i64> [[A1:%.*]],
-; CHECK-NEXT:    [[S:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[A0:%.*]], <4 x i64> [[M]])
+; CHECK-NEXT:    [[S:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[A0:%.*]], <4 x i64> [[A1:%.*]])
 ; CHECK-NEXT:    ret <4 x double> [[S]]
 ;
   %m = or <4 x i64> %a1,
@@ -503,8 +499,7 @@ define <16 x i32> @undef_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %pa
 
 define <16 x i32> @demandedbit_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %a1) {
 ; CHECK-LABEL: @demandedbit_test_permvar_si_512_mask(
-; CHECK-NEXT:    [[M:%.*]] = or <16 x i32> [[A1:%.*]],
-; CHECK-NEXT:    [[S:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[A0:%.*]], <16 x i32> [[M]])
+; CHECK-NEXT:    [[S:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1:%.*]])
 ; CHECK-NEXT:    ret <16 x i32> [[S]]
 ;
   %m = or <16 x i32> %a1,
@@ -602,8 +597,7 @@ define <16 x float> @undef_test_permvar_sf_512_mask(<16 x float> %a0, <16 x floa
 
 define <16 x float> @demandedbit_test_permvar_sf_512_mask(<16 x float> %a0, <16 x i32> %a1) {
 ; CHECK-LABEL: @demandedbit_test_permvar_sf_512_mask(
-; CHECK-NEXT:    [[M:%.*]] = or <16 x i32> [[A1:%.*]],
-; CHECK-NEXT:    [[S:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[A0:%.*]], <16 x i32> [[M]])
+; CHECK-NEXT:    [[S:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[A0:%.*]], <16 x i32> [[A1:%.*]])
 ; CHECK-NEXT:    ret <16 x float> [[S]]
 ;
   %m = or <16 x i32> %a1,
@@ -701,8 +695,7 @@ define <8 x i64> @undef_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passt
 
 define <8 x i64> @demandedbit_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %a1) {
 ; CHECK-LABEL: @demandedbit_test_permvar_di_512_mask(
-; CHECK-NEXT:    [[M:%.*]] = or <8 x i64> [[A1:%.*]],
-; CHECK-NEXT:    [[S:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[A0:%.*]], <8 x i64> [[M]])
+; CHECK-NEXT:    [[S:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1:%.*]])
 ; CHECK-NEXT:    ret <8 x i64> [[S]]
 ;
   %m = or <8 x i64> %a1,
@@ -800,8 +793,7 @@ define <8 x double> @undef_test_permvar_df_512_mask(<8 x double> %a0, <8 x doubl
 
 define <8 x double> @demandedbit_test_permvar_df_512_mask(<8 x double> %a0, <8 x i64> %a1) {
 ; CHECK-LABEL: @demandedbit_test_permvar_df_512_mask(
-; CHECK-NEXT:    [[M:%.*]] = or <8 x i64> [[A1:%.*]],
-; CHECK-NEXT:    [[S:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[A0:%.*]], <8 x i64> [[M]])
+; CHECK-NEXT:    [[S:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[A0:%.*]], <8 x i64> [[A1:%.*]])
 ; CHECK-NEXT:    ret <8 x double> [[S]]
 ;
   %m = or <8 x i64> %a1,
@@ -899,8 +891,7 @@ define <8 x i16> @undef_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passt
 
 define <8 x i16> @demandedbit_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %a1) {
 ; CHECK-LABEL: @demandedbit_test_permvar_hi_128_mask(
-; CHECK-NEXT:    [[M:%.*]] = or <8 x i16> [[A1:%.*]],
-; CHECK-NEXT:    [[S:%.*]] = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> [[A0:%.*]], <8 x i16> [[M]])
+; CHECK-NEXT:    [[S:%.*]] = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]])
 ; CHECK-NEXT:    ret <8 x i16> [[S]]
 ;
   %m = or <8 x i16> %a1,
@@ -998,8 +989,7 @@ define <16 x i16> @undef_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %pa
 
 define <16 x i16> @demandedbit_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %a1) {
 ; CHECK-LABEL: @demandedbit_test_permvar_hi_256_mask(
-; CHECK-NEXT:    [[M:%.*]] = or <16 x i16> [[A1:%.*]],
-; CHECK-NEXT:    [[S:%.*]] = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> [[A0:%.*]], <16 x i16> [[M]])
+; CHECK-NEXT:    [[S:%.*]] = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
 ; CHECK-NEXT:    ret <16 x i16> [[S]]
 ;
   %m = or <16 x i16> %a1,
@@ -1097,8 +1087,7 @@ define <32 x i16> @undef_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %pa
 
 define <32 x i16> @demandedbit_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %a1) {
 ; CHECK-LABEL: @demandedbit_test_permvar_hi_512_mask(
-; CHECK-NEXT:    [[M:%.*]] = or <32 x i16> [[A1:%.*]],
-; CHECK-NEXT:    [[S:%.*]] = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> [[A0:%.*]], <32 x i16> [[M]])
+; CHECK-NEXT:    [[S:%.*]] = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> [[A0:%.*]], <32 x i16> [[A1:%.*]])
 ; CHECK-NEXT:    ret <32 x i16> [[S]]
 ;
   %m = or <32 x i16> %a1,
@@ -1196,8 +1185,7 @@ define <16 x i8> @undef_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passt
 
 define <16 x i8> @demandedbit_test_permvar_qi_129_mask(<16 x i8> %a0, <16 x i8> %a1) {
 ; CHECK-LABEL: @demandedbit_test_permvar_qi_129_mask(
-; CHECK-NEXT:    [[M:%.*]] = or <16 x i8> [[A1:%.*]],
-; CHECK-NEXT:    [[S:%.*]] = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> [[A0:%.*]], <16 x i8> [[M]])
+; CHECK-NEXT:    [[S:%.*]] = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> [[A0:%.*]], <16 x i8> [[A1:%.*]])
 ; CHECK-NEXT:    ret <16 x i8> [[S]]
 ;
   %m = or <16 x i8> %a1,
@@ -1295,8 +1283,7 @@ define <32 x i8> @undef_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passt
 
 define <32 x i8> @demandedbit_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %a1) {
 ; CHECK-LABEL: @demandedbit_test_permvar_qi_256_mask(
-; CHECK-NEXT:    [[M:%.*]] = or <32 x i8> [[A1:%.*]],
-; CHECK-NEXT:    [[S:%.*]] = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> [[A0:%.*]], <32 x i8> [[M]])
+; CHECK-NEXT:    [[S:%.*]] = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
 ; CHECK-NEXT:    ret <32 x i8> [[S]]
 ;
   %m = or <32 x i8> %a1,
@@ -1394,8 +1381,7 @@ define <64 x i8> @undef_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passt
 
 define <64 x i8> @demandedbit_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %a1) {
 ; CHECK-LABEL: @demandedbit_test_permvar_qi_512_mask(
-; CHECK-NEXT:    [[M:%.*]] = or <64 x i8> [[A1:%.*]],
-; CHECK-NEXT:    [[S:%.*]] = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> [[A0:%.*]], <64 x i8> [[M]])
+; CHECK-NEXT:    [[S:%.*]] = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> [[A0:%.*]], <64 x i8> [[A1:%.*]])
 ; CHECK-NEXT:    ret <64 x i8> [[S]]
 ;
   %m = or <64 x i8> %a1,
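The byte-element permutes above follow the same rule with more slack per lane: @llvm.x86.avx512.permvar.qi.128 has 16 lanes, so only log2(16) = 4 of the 8 index bits are demanded. A hedged sketch of that case (function name and constants are illustrative, not from the tests):

; -16 = 0xF0 in each i8 lane: it touches only bits 4-7, which the shuffle ignores,
; so the `or` is removed and the call uses %a1 directly.
declare <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8>, <16 x i8>)

define <16 x i8> @byte_mask_sketch(<16 x i8> %a0, <16 x i8> %a1) {
  %m = or <16 x i8> %a1, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
  %s = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> %m)
  ret <16 x i8> %s
}
; Expected after opt -passes=instcombine:
;   %s = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> %a1)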
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vpermi2.ll b/llvm/test/Transforms/InstCombine/X86/x86-vpermi2.ll
index eb6ad4458d932..d38e48ac817ae 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-vpermi2.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vpermi2.ll
@@ -28,8 +28,7 @@ define <2 x i64> @shuffle_vpermv3_v2i64_unary(<2 x i64> %x0) {
 define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %m) {
 ; CHECK-LABEL: define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits(
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[M:%.*]]) {
-; CHECK-NEXT:    [[T:%.*]] = or <2 x i64> [[M]],
-; CHECK-NEXT:    [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[T]], <2 x i64> [[X1]])
+; CHECK-NEXT:    [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[M]], <2 x i64> [[X1]])
 ; CHECK-NEXT:    ret <2 x i64> [[R]]
 ;
   %t = or <2 x i64> %m,
@@ -72,8 +71,7 @@ define <4 x i64> @shuffle_vpermv3_v4i64_unary(<4 x i64> %x0) {
 define <4 x i64> @shuffle_vpermv3_v4i64_demandedbits(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %m) {
 ; CHECK-LABEL: define <4 x i64> @shuffle_vpermv3_v4i64_demandedbits(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[M:%.*]]) {
-; CHECK-NEXT:    [[T:%.*]] = or <4 x i64> [[M]],
-; CHECK-NEXT:    [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[T]], <4 x i64> [[X1]])
+; CHECK-NEXT:    [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[M]], <4 x i64> [[X1]])
 ; CHECK-NEXT:    ret <4 x i64> [[R]]
 ;
   %t = or <4 x i64> %m,
@@ -104,8 +102,7 @@ define <8 x i64> @shuffle_vpermv3_v8i64_unary(<8 x i64> %x0) {
 define <8 x i64> @shuffle_vpermv3_v8i64_demandedbits(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %m) {
 ; CHECK-LABEL: define <8 x i64> @shuffle_vpermv3_v8i64_demandedbits(
 ; CHECK-SAME: <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[M:%.*]]) {
-; CHECK-NEXT:    [[T:%.*]] = or <8 x i64> [[M]],
-; CHECK-NEXT:    [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> [[T]], <8 x i64> [[X1]])
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> [[M]], <8 x i64> [[X1]])
 ; CHECK-NEXT:    ret <8 x i64> [[R]]
 ;
   %t = or <8 x i64> %m,
@@ -140,8 +137,7 @@ define <4 x i32> @shuffle_vpermv3_v4i32_unary(<4 x i32> %x0) {
 define <4 x i32> @shuffle_vpermv3_v4i32_demandedbits(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %m) {
 ; CHECK-LABEL: define <4 x i32> @shuffle_vpermv3_v4i32_demandedbits(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[M:%.*]]) {
-; CHECK-NEXT:    [[T:%.*]] = or <4 x i32> [[M]],
-; CHECK-NEXT:    [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[T]], <4 x i32> [[X1]])
+; CHECK-NEXT:    [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[M]], <4 x i32> [[X1]])
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
   %t = or <4 x i32> %m,
@@ -172,8 +168,7 @@ define <8 x i32> @shuffle_vpermv3_v8i32_unary(<8 x i32> %x0) {
 define <8 x i32> @shuffle_vpermv3_v8i32_demandedbits(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %m) {
 ; CHECK-LABEL: define <8 x i32> @shuffle_vpermv3_v8i32_demandedbits(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[M:%.*]]) {
-; CHECK-NEXT:    [[T:%.*]] = or <8 x i32> [[M]],
-; CHECK-NEXT:    [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[T]], <8 x i32> [[X1]])
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[M]], <8 x i32> [[X1]])
 ; CHECK-NEXT:    ret <8 x i32> [[R]]
 ;
   %t = or <8 x i32> %m,
@@ -204,8 +199,7 @@ define <16 x i32> @shuffle_vpermv3_v16i32_unary(<16 x i32> %x0) {
 define <16 x i32> @shuffle_vpermv3_v16i32_demandedbits(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %m) {
 ; CHECK-LABEL: define <16 x i32> @shuffle_vpermv3_v16i32_demandedbits(
 ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[M:%.*]]) {
-; CHECK-NEXT:    [[T:%.*]] = or <16 x i32> [[M]],
-; CHECK-NEXT:    [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> [[T]], <16 x i32> [[X1]])
+; CHECK-NEXT:    [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> [[M]], <16 x i32> [[X1]])
 ; CHECK-NEXT:    ret <16 x i32> [[R]]
 ;
   %t = or <16 x i32> %m,
@@ -240,8 +234,7 @@ define <8 x i16> @shuffle_vpermv3_v8i16_unary(<8 x i16> %x0) {
 define <8 x i16> @shuffle_vpermv3_v8i16_demandedbits(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %m) {
 ; CHECK-LABEL: define <8 x i16> @shuffle_vpermv3_v8i16_demandedbits(
 ; CHECK-SAME: <8 x i16> [[X0:%.*]], <8 x i16> [[X1:%.*]], <8 x i16> [[M:%.*]]) {
-; CHECK-NEXT:    [[T:%.*]] = or <8 x i16> [[M]],
-; CHECK-NEXT:    [[R:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[X0]], <8 x i16> [[T]], <8 x i16> [[X1]])
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[X0]], <8 x i16> [[M]], <8 x i16> [[X1]])
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %t = or <8 x i16> %m,
@@ -272,8 +265,7 @@ define <16 x i16> @shuffle_vpermv3_v16i16_unary(<16 x i16> %x0) {
 define <16 x i16> @shuffle_vpermv3_v16i16_demandedbits(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %m) {
 ; CHECK-LABEL: define <16 x i16> @shuffle_vpermv3_v16i16_demandedbits(
 ; CHECK-SAME: <16 x i16> [[X0:%.*]], <16 x i16> [[X1:%.*]], <16 x i16> [[M:%.*]]) {
-; CHECK-NEXT:    [[T:%.*]] = or <16 x i16> [[M]],
-; CHECK-NEXT:    [[R:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[X0]], <16 x i16> [[T]], <16 x i16> [[X1]])
+; CHECK-NEXT:    [[R:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[X0]], <16 x i16> [[M]], <16 x i16> [[X1]])
 ; CHECK-NEXT:    ret <16 x i16> [[R]]
 ;
   %t = or <16 x i16> %m,
@@ -304,8 +296,7 @@ define <32 x i16> @shuffle_vpermv3_v32i16_unary(<32 x i16> %x0) {
 define <32 x i16> @shuffle_vpermv3_v32i16_demandedbits(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %m) {
 ; CHECK-LABEL: define <32 x i16> @shuffle_vpermv3_v32i16_demandedbits(
 ; CHECK-SAME: <32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]], <32 x i16> [[M:%.*]]) {
-; CHECK-NEXT:    [[T:%.*]] = or <32 x i16> [[M]],
-; CHECK-NEXT:    [[R:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0]], <32 x i16> [[T]], <32 x i16> [[X1]])
+; CHECK-NEXT:    [[R:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0]], <32 x i16> [[M]], <32 x i16> [[X1]])
 ; CHECK-NEXT:    ret <32 x i16> [[R]]
 ;
   %t = or <32 x i16> %m,
@@ -340,8 +331,7 @@ define <16 x i8> @shuffle_vpermv3_v16i8_unary(<16 x i8> %x0) {
 define <16 x i8> @shuffle_vpermv3_v16i8_demandedbits(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %m) {
 ; CHECK-LABEL: define <16 x i8> @shuffle_vpermv3_v16i8_demandedbits(
 ; CHECK-SAME: <16 x i8> [[X0:%.*]], <16 x i8> [[X1:%.*]], <16 x i8> [[M:%.*]]) {
-; CHECK-NEXT:    [[T:%.*]] = or <16 x i8> [[M]],
-; CHECK-NEXT:    [[R:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[X0]], <16 x i8> [[T]], <16 x i8> [[X1]])
+; CHECK-NEXT:    [[R:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[X0]], <16 x i8> [[M]], <16 x i8> [[X1]])
 ; CHECK-NEXT:    ret <16 x i8> [[R]]
 ;
   %t = or <16 x i8> %m,
@@ -372,8 +362,7 @@ define <32 x i8> @shuffle_vpermv3_v32i8_unary(<32 x i8> %x0) {
 define <32 x i8> @shuffle_vpermv3_v32i8_demandedbits(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %m) {
 ; CHECK-LABEL: define <32 x i8> @shuffle_vpermv3_v32i8_demandedbits(
 ; CHECK-SAME: <32 x i8> [[X0:%.*]], <32 x i8> [[X1:%.*]], <32 x i8> [[M:%.*]]) {
-; CHECK-NEXT:    [[T:%.*]] = or <32 x i8> [[M]],
-; CHECK-NEXT:    [[R:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[X0]], <32 x i8> [[T]], <32 x i8> [[X1]])
+; CHECK-NEXT:    [[R:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[X0]], <32 x i8> [[M]], <32 x i8> [[X1]])
 ; CHECK-NEXT:    ret <32 x i8> [[R]]
 ;
   %t = or <32 x i8> %m,
@@ -404,8 +393,7 @@ define <64 x i8> @shuffle_vpermv3_v64i8_unary(<64 x i8> %x0) {
 define <64 x i8> @shuffle_vpermv3_v64i8_demandedbits(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %m) {
 ; CHECK-LABEL: define <64 x i8> @shuffle_vpermv3_v64i8_demandedbits(
 ; CHECK-SAME: <64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]], <64 x i8> [[M:%.*]]) {
-; CHECK-NEXT:    [[T:%.*]] = or <64 x i8> [[M]],
-; CHECK-NEXT:    [[R:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[X0]], <64 x i8> [[T]], <64 x i8> [[X1]])
+; CHECK-NEXT:    [[R:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[X0]], <64 x i8> [[M]], <64 x i8> [[X1]])
 ; CHECK-NEXT:    ret <64 x i8> [[R]]
 ;
   %t = or <64 x i8> %m,
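For the binary VPERMV3 form the index also selects between the two source vectors, so one extra bit is demanded: with @llvm.x86.avx512.vpermi2var.d.128 that is log2(2 * 4) = 3 bits per i32 lane, matching the IsBinary=true path above. A hypothetical sketch of the fold (function name and constants are illustrative):

; Operand 1 is the index vector, the same operand SimplifyDemandedBits is
; called on with OpNo=1. -8 sets only bits 3-31, so the `or` folds away.
declare <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>)

define <4 x i32> @binary_mask_sketch(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %m) {
  %t = or <4 x i32> %m, <i32 -8, i32 -8, i32 -8, i32 -8>
  %r = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %t, <4 x i32> %x1)
  ret <4 x i32> %r
}
; Expected after opt -passes=instcombine:
;   %r = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %m, <4 x i32> %x1)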